forked from antonis.lempesis/dnet-hadoop
DedupConf parsed using Jackson library
This commit is contained in:
parent
8b4762bf54
commit
ea36007d1f
|
@ -1,24 +0,0 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class ClusteringResolver implements Serializable {
|
||||
private final Map<String, Class<ClusteringFunction>> functionMap;
|
||||
|
||||
public ClusteringResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction resolve(String clusteringFunction, Map<String, Integer> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||
|
||||
return functionMap.get(clusteringFunction).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class ConditionResolver implements Serializable {
|
||||
private final Map<String, Class<ConditionAlgo>> functionMap;
|
||||
|
||||
public ConditionResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
|
||||
.filter(ConditionAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||
}
|
||||
|
||||
public ConditionAlgo resolve(String name, List<FieldDef> fields) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||
return functionMap.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
|
||||
}
|
||||
}
|
|
@ -7,12 +7,11 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
|
@ -35,7 +34,7 @@ public class DedupConfig implements Config, Serializable {
|
|||
|
||||
static {
|
||||
defaults.put("threshold", "0");
|
||||
defaults.put("run", "001");
|
||||
defaults.put("dedupRun", "001");
|
||||
defaults.put("entityType", "result");
|
||||
defaults.put("orderField", "title");
|
||||
defaults.put("queueMaxSize", "2000");
|
||||
|
@ -49,11 +48,15 @@ public class DedupConfig implements Config, Serializable {
|
|||
|
||||
public static DedupConfig load(final String json) {
|
||||
|
||||
final DedupConfig config = new Gson().fromJson(json, DedupConfig.class);
|
||||
|
||||
final DedupConfig config;
|
||||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
|
||||
return config;
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static DedupConfig loadDefault() throws IOException {
|
||||
|
|
|
@ -11,6 +11,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
|
|||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.CondDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
public class PaceConfig implements Serializable {
|
||||
|
@ -23,6 +24,8 @@ public class PaceConfig implements Serializable {
|
|||
|
||||
private Map<String, FieldDef> modelMap;
|
||||
|
||||
public static PaceResolver paceResolver;
|
||||
|
||||
public PaceConfig() {}
|
||||
|
||||
public void initModel() {
|
||||
|
@ -30,6 +33,8 @@ public class PaceConfig implements Serializable {
|
|||
for(FieldDef fd : getModel()) {
|
||||
modelMap.put(fd.getName(), fd);
|
||||
}
|
||||
|
||||
paceResolver = new PaceResolver();
|
||||
}
|
||||
|
||||
public List<FieldDef> getModel() {
|
||||
|
|
|
@ -1,24 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class DistanceResolver implements Serializable {
|
||||
private final Map<String, Class<DistanceAlgo>> functionMap;
|
||||
|
||||
public DistanceResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
}
|
||||
|
||||
public DistanceAlgo resolve(String algo, Map<String, Number> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||
|
||||
return functionMap.get(algo).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
}
|
||||
}
|
|
@ -1,12 +1,14 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.clustering.*;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
public class ClusteringDef implements Serializable {
|
||||
|
||||
|
@ -16,8 +18,6 @@ public class ClusteringDef implements Serializable {
|
|||
|
||||
private Map<String, Integer> params;
|
||||
|
||||
private ClusteringResolver clusteringResolver = new ClusteringResolver();
|
||||
|
||||
public ClusteringDef() {}
|
||||
|
||||
public String getName() {
|
||||
|
@ -29,12 +29,11 @@ public class ClusteringDef implements Serializable {
|
|||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction() {
|
||||
|
||||
try {
|
||||
return clusteringResolver.resolve(getName(), params);
|
||||
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||
return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
|
||||
} catch (PaceException e) {
|
||||
e.printStackTrace();
|
||||
return new RandomClusteringFunction(getParams());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -56,7 +55,11 @@ public class ClusteringDef implements Serializable {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
return e.getStackTrace().toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.condition.*;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
public class CondDef implements Serializable {
|
||||
|
||||
|
@ -13,19 +14,10 @@ public class CondDef implements Serializable {
|
|||
|
||||
private List<String> fields;
|
||||
|
||||
private ConditionResolver conditionResolver = new ConditionResolver();
|
||||
|
||||
public CondDef() {}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
||||
|
||||
try {
|
||||
return conditionResolver.resolve(getName(), fields);
|
||||
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||
e.printStackTrace();
|
||||
return new AlwaysTrueCondition(getName(), fields);
|
||||
}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){
|
||||
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
|
@ -46,7 +38,11 @@ public class CondDef implements Serializable {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
return e.getStackTrace().toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,9 +9,11 @@ import java.util.Map;
|
|||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.*;
|
||||
import eu.dnetlib.pace.distance.algo.*;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
|
||||
|
@ -38,8 +40,6 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private Map<String, Number> params;
|
||||
|
||||
private DistanceResolver distanceResolver = new DistanceResolver();
|
||||
|
||||
public FieldDef() {}
|
||||
|
||||
// def apply(s: String): Field[A]
|
||||
|
@ -70,18 +70,12 @@ public class FieldDef implements Serializable {
|
|||
|
||||
public DistanceAlgo getDistanceAlgo() {
|
||||
|
||||
try {
|
||||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
params.put("limit", getLimit());
|
||||
params.put("weight", getWeight());
|
||||
return distanceResolver.resolve(getAlgo(), params);
|
||||
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||
e.printStackTrace();
|
||||
return new NullDistanceAlgo(params);
|
||||
}
|
||||
|
||||
return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params);
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
|
|
|
@ -13,7 +13,6 @@ import com.google.common.collect.Maps;
|
|||
import com.google.common.collect.Ordering;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import eu.dnetlib.pace.model.adaptor.PidOafSerialiser;
|
||||
|
||||
public class GTAuthor implements Comparable<GTAuthor> {
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class ScoredResult extends Result {
|
||||
|
||||
|
@ -20,7 +22,11 @@ public class ScoredResult extends Result {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
return e.getStackTrace().toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
/**
 * Unchecked exception carrying a message and the underlying cause of a failure.
 */
public class PaceException extends RuntimeException {

    /**
     * Creates the exception.
     *
     * @param message human-readable description of what went wrong
     * @param cause the underlying failure, kept as the exception cause
     */
    public PaceException(String message, Throwable cause) {
        super(message, cause);
    }
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.condition.ConditionClass;
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PaceResolver implements Serializable {
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
||||
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
||||
|
||||
public PaceResolver() {
|
||||
|
||||
this.clusteringFunctions = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
|
||||
this.conditionAlgos = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
|
||||
.filter(ConditionAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||
|
||||
this.distanceAlgos = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
try {
|
||||
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + "not found", e);
|
||||
}
|
||||
}
|
||||
|
||||
public DistanceAlgo getDistanceAlgo(String name, Map<String, Number> params) throws PaceException {
|
||||
try {
|
||||
return distanceAlgos.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + "not found", e);
|
||||
}
|
||||
}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(String name, List<FieldDef> fields) throws PaceException {
|
||||
try {
|
||||
return conditionAlgos.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + "not found", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class ClusteringResolverTest {
|
||||
|
||||
private ClusteringResolver clusteringResolver;
|
||||
private Map<String,Integer> params = new HashMap<String, Integer>();
|
||||
|
||||
@Before
|
||||
public void setUp(){
|
||||
clusteringResolver = new ClusteringResolver();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
|
||||
|
||||
ClusteringFunction ngrams = clusteringResolver.resolve("ngrams", params);
|
||||
|
||||
assertEquals(ngrams.getClass(), Ngrams.class);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.clustering.ClusteringResolver;
|
||||
import eu.dnetlib.pace.clustering.Ngrams;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class ConditionResolverTest {
|
||||
|
||||
private ConditionResolver conditionResolver;
|
||||
private List<FieldDef> fields;
|
||||
private String name;
|
||||
|
||||
@Before
|
||||
public void setUp(){
|
||||
conditionResolver = new ConditionResolver();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
|
||||
|
||||
ConditionAlgo sizeMatch = conditionResolver.resolve("sizeMatch", fields);
|
||||
|
||||
assertEquals(sizeMatch.getClass(), SizeMatch.class);
|
||||
}
|
||||
}
|
|
@ -76,8 +76,16 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
|
||||
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
|
||||
//verify if the serialization produces the same result of the input json
|
||||
// assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
|
||||
assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dedupConfigTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
||||
|
||||
System.out.println(load.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"run" : "001",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
|
|
Loading…
Reference in New Issue