2018-10-02 10:37:54 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
package eu.dnetlib.pace.config;
|
2019-12-05 14:14:25 +01:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
import java.io.IOException;
|
2018-10-02 17:07:17 +02:00
|
|
|
import java.io.Serializable;
|
2023-06-16 09:41:11 +02:00
|
|
|
import java.nio.charset.StandardCharsets;
|
2023-07-18 11:38:56 +02:00
|
|
|
import java.util.AbstractMap;
|
2018-10-02 10:37:54 +02:00
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Map.Entry;
|
2023-07-18 11:38:56 +02:00
|
|
|
import java.util.function.Predicate;
|
2023-06-16 09:41:11 +02:00
|
|
|
import java.util.regex.Pattern;
|
2023-07-18 11:38:56 +02:00
|
|
|
import java.util.regex.PatternSyntaxException;
|
2023-06-16 09:41:11 +02:00
|
|
|
import java.util.stream.Collectors;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
import org.antlr.stringtemplate.StringTemplate;
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
|
import com.google.common.collect.Maps;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
import eu.dnetlib.pace.model.ClusteringDef;
|
|
|
|
import eu.dnetlib.pace.model.FieldDef;
|
|
|
|
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
|
|
|
import eu.dnetlib.pace.util.PaceException;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2018-10-02 17:07:17 +02:00
|
|
|
public class DedupConfig implements Config, Serializable {
|
2018-10-02 10:37:54 +02:00
|
|
|
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
|
|
|
|
|
|
|
private PaceConfig pace;
|
|
|
|
|
|
|
|
private WfConfig wf;
|
|
|
|
|
2023-06-16 09:41:11 +02:00
|
|
|
@JsonIgnore
|
2023-07-18 11:38:56 +02:00
|
|
|
private Map<String, Predicate<String>> blacklists;
|
2023-06-16 09:41:11 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
private static Map<String, String> defaults = Maps.newHashMap();
|
|
|
|
|
|
|
|
static {
|
2018-10-29 11:13:55 +01:00
|
|
|
defaults.put("dedupRun", "001");
|
2018-10-02 10:37:54 +02:00
|
|
|
defaults.put("entityType", "result");
|
2018-11-15 16:52:56 +01:00
|
|
|
defaults.put("subEntityType", "resulttype");
|
|
|
|
defaults.put("subEntityValue", "publication");
|
2018-10-02 10:37:54 +02:00
|
|
|
defaults.put("orderField", "title");
|
|
|
|
defaults.put("queueMaxSize", "2000");
|
|
|
|
defaults.put("groupMaxSize", "10");
|
|
|
|
defaults.put("slidingWindowSize", "200");
|
|
|
|
defaults.put("rootBuilder", "result");
|
|
|
|
defaults.put("includeChildren", "true");
|
2019-11-07 12:47:12 +01:00
|
|
|
defaults.put("maxIterations", "20");
|
2020-01-14 10:42:43 +01:00
|
|
|
defaults.put("idPath", "$.id");
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
public DedupConfig() {
|
|
|
|
}
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
public static DedupConfig load(final String json) {
|
|
|
|
|
2018-10-29 11:13:55 +01:00
|
|
|
final DedupConfig config;
|
|
|
|
try {
|
|
|
|
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
|
|
|
config.getPace().initModel();
|
2019-10-08 14:53:52 +02:00
|
|
|
config.getPace().initTranslationMap();
|
2023-06-16 09:41:11 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
config.blacklists = config
|
|
|
|
.getPace()
|
|
|
|
.getBlacklists()
|
|
|
|
.entrySet()
|
|
|
|
.stream()
|
2023-07-18 11:38:56 +02:00
|
|
|
.map(
|
|
|
|
e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(),
|
|
|
|
e
|
|
|
|
.getValue()
|
|
|
|
.stream()
|
|
|
|
.filter(s -> !StringUtils.isBlank(s))
|
|
|
|
.map(Pattern::compile)
|
|
|
|
.collect(Collectors.toList())))
|
2023-07-06 10:28:53 +02:00
|
|
|
.collect(
|
|
|
|
Collectors
|
|
|
|
.toMap(
|
|
|
|
e -> e.getKey(),
|
2023-07-18 11:38:56 +02:00
|
|
|
e -> (Predicate<String> & Serializable) s -> e
|
2023-07-06 10:28:53 +02:00
|
|
|
.getValue()
|
|
|
|
.stream()
|
2023-07-18 11:38:56 +02:00
|
|
|
.filter(p -> p.matcher(s).matches())
|
|
|
|
.findFirst()
|
|
|
|
.isPresent()))
|
|
|
|
|
|
|
|
;
|
2023-06-16 09:41:11 +02:00
|
|
|
|
2018-10-29 11:13:55 +01:00
|
|
|
return config;
|
2023-07-18 11:38:56 +02:00
|
|
|
} catch (IOException | PatternSyntaxException e) {
|
2018-10-29 11:13:55 +01:00
|
|
|
throw new PaceException("Error in parsing configuration json", e);
|
|
|
|
}
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
public static DedupConfig loadDefault() throws IOException {
|
|
|
|
return loadDefault(new HashMap<String, String>());
|
|
|
|
}
|
|
|
|
|
|
|
|
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
|
|
|
|
|
|
|
|
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
|
|
|
|
|
|
|
|
for (final Entry<String, String> e : defaults.entrySet()) {
|
|
|
|
template.setAttribute(e.getKey(), e.getValue());
|
|
|
|
}
|
|
|
|
for (final Entry<String, String> e : params.entrySet()) {
|
2019-02-17 12:48:25 +01:00
|
|
|
if (template.getAttribute(e.getKey()) != null) {
|
|
|
|
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
|
|
|
|
} else {
|
|
|
|
template.setAttribute(e.getKey(), e.getValue());
|
|
|
|
}
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
final String json = template.toString();
|
|
|
|
return load(json);
|
|
|
|
}
|
|
|
|
|
|
|
|
private String readFromClasspath(final String resource) throws IOException {
|
2023-06-16 09:41:11 +02:00
|
|
|
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
public PaceConfig getPace() {
|
|
|
|
return pace;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void setPace(final PaceConfig pace) {
|
|
|
|
this.pace = pace;
|
|
|
|
}
|
|
|
|
|
|
|
|
public WfConfig getWf() {
|
|
|
|
return wf;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void setWf(final WfConfig wf) {
|
|
|
|
this.wf = wf;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public String toString() {
|
2018-10-26 14:55:59 +02:00
|
|
|
try {
|
2018-11-12 15:52:18 +01:00
|
|
|
return new ObjectMapper().writeValueAsString(this);
|
2018-10-26 14:55:59 +02:00
|
|
|
} catch (IOException e) {
|
2018-11-12 15:52:18 +01:00
|
|
|
throw new PaceException("unable to serialise configuration", e);
|
2018-10-26 14:55:59 +02:00
|
|
|
}
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
2019-08-09 10:08:34 +02:00
|
|
|
@Override
|
2023-07-06 10:28:53 +02:00
|
|
|
public Map<String, TreeNodeDef> decisionTree() {
|
2019-08-09 10:08:34 +02:00
|
|
|
return getPace().getDecisionTree();
|
|
|
|
}
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
@Override
|
|
|
|
public List<FieldDef> model() {
|
|
|
|
return getPace().getModel();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public List<ClusteringDef> clusterings() {
|
|
|
|
return getPace().getClustering();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2023-07-18 11:38:56 +02:00
|
|
|
public Map<String, Predicate<String>> blacklists() {
|
2023-06-16 09:41:11 +02:00
|
|
|
return blacklists;
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
2019-10-08 14:53:52 +02:00
|
|
|
@Override
|
|
|
|
public Map<String, String> translationMap() {
|
|
|
|
return getPace().translationMap();
|
|
|
|
}
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|