package eu.dnetlib.pace.config;

import java.io.IOException;
import java.io.Serializable;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;

import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;

import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;

/**
 * Deduplication configuration, made of a {@link PaceConfig} section and a {@link WfConfig} section.
 *
 * <p>Instances are created from JSON through {@link #load(String)}, or from the bundled
 * {@code dedupConfig.st} template through {@link #loadDefault()} / {@link #loadDefault(Map)}.
 * The public no-arg constructor is kept for the JSON mapper.
 *
 * <p>NOTE(review): the generic type arguments in this class were reconstructed from how the values
 * are used in this file (String keys/values, {@code Pattern::compile}, the imported model types).
 * Confirm them against the {@code Config} interface and {@code PaceConfig}.
 */
public class DedupConfig implements Config, Serializable {

    /** Name of the classpath template used by {@link #loadDefault(Map)}, resolved relative to this class. */
    private static final String CONFIG_TEMPLATE = "dedupConfig.st";

    /** Single mapper shared by {@link #load(String)} and {@link #toString()} instead of one per call. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Template attribute values applied by {@link #loadDefault(Map)} before the caller's overrides. */
    private static final Map<String, String> defaults = Maps.newHashMap();

    static {
        defaults.put("dedupRun", "001");
        defaults.put("entityType", "result");
        defaults.put("subEntityType", "resulttype");
        defaults.put("subEntityValue", "publication");
        defaults.put("orderField", "title");
        defaults.put("queueMaxSize", "2000");
        defaults.put("groupMaxSize", "10");
        defaults.put("slidingWindowSize", "200");
        defaults.put("rootBuilder", "result");
        defaults.put("includeChildren", "true");
        defaults.put("maxIterations", "20");
        defaults.put("idPath", "$.id");
    }

    private PaceConfig pace;

    private WfConfig wf;

    /**
     * Compiled blacklists: one predicate per key of {@code getPace().getBlacklists()}.
     * Built in {@link #load(String)} and excluded from JSON (de)serialisation.
     */
    @JsonIgnore
    private Map<String, Predicate<String>> blacklists;

    public DedupConfig() {
    }

    /**
     * Parses a JSON configuration and prepares its derived state.
     *
     * <p>After deserialisation this calls {@code initModel()} and {@code initTranslationMap()} on the
     * pace section, then compiles every non-blank blacklist entry as a regular expression. The
     * resulting predicate for a key accepts a string when at least one of that key's patterns
     * matches the whole string.
     *
     * @param json the configuration as JSON text
     * @return the initialised configuration
     * @throws PaceException if the JSON cannot be read or a blacklist entry is not a valid regex
     */
    public static DedupConfig load(final String json) {
        try {
            final DedupConfig config = MAPPER.readValue(json, DedupConfig.class);
            config.getPace().initModel();
            config.getPace().initTranslationMap();

            config.blacklists = config
                .getPace()
                .getBlacklists()
                .entrySet()
                .stream()
                .collect(
                    Collectors
                        .toMap(
                            e -> e.getKey(),
                            e -> {
                                // Compile once here so the predicate only runs matchers.
                                final List<Pattern> patterns = e
                                    .getValue()
                                    .stream()
                                    .filter(StringUtils::isNotBlank)
                                    .map(Pattern::compile)
                                    .collect(Collectors.toList());
                                // The intersection cast keeps the lambda serialisable,
                                // since this class is itself Serializable.
                                return (Predicate<String> & Serializable) s -> patterns
                                    .stream()
                                    .anyMatch(p -> p.matcher(s).matches());
                            }));

            return config;
        } catch (IOException | PatternSyntaxException e) {
            throw new PaceException("Error in parsing configuration json", e);
        }
    }

    /**
     * Loads the bundled template using only the built-in default attribute values.
     *
     * @return the initialised configuration
     * @throws IOException if the template cannot be read from the classpath
     */
    public static DedupConfig loadDefault() throws IOException {
        return loadDefault(new HashMap<String, String>());
    }

    /**
     * Loads the bundled template, applying the built-in defaults first and then {@code params}.
     *
     * @param params attribute values that override or extend the defaults; {@code null} is treated
     *               as "no overrides"
     * @return the initialised configuration
     * @throws IOException if the template cannot be read from the classpath
     */
    public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
        final StringTemplate template = new StringTemplate(readFromClasspath(CONFIG_TEMPLATE));

        for (final Entry<String, String> e : defaults.entrySet()) {
            template.setAttribute(e.getKey(), e.getValue());
        }

        if (params != null) {
            for (final Entry<String, String> e : params.entrySet()) {
                if (template.getAttribute(e.getKey()) != null) {
                    // Replace the existing value in the attribute map directly.
                    // NOTE(review): presumably needed because StringTemplate#setAttribute adds to an
                    // already-set attribute instead of replacing it; confirm for the version in use.
                    template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
                } else {
                    template.setAttribute(e.getKey(), e.getValue());
                }
            }
        }

        return load(template.toString());
    }

    /**
     * Reads a classpath resource, resolved relative to this class, as UTF-8 text.
     *
     * @param resource the resource name
     * @return the resource content
     * @throws IOException if the resource does not exist or cannot be read
     */
    private static String readFromClasspath(final String resource) throws IOException {
        final URL url = DedupConfig.class.getResource(resource);
        if (url == null) {
            // Fail with a clear message instead of a NullPointerException inside IOUtils.
            throw new IOException("Classpath resource not found: " + resource);
        }
        return IOUtils.toString(url, StandardCharsets.UTF_8);
    }

    public PaceConfig getPace() {
        return pace;
    }

    public void setPace(final PaceConfig pace) {
        this.pace = pace;
    }

    public WfConfig getWf() {
        return wf;
    }

    public void setWf(final WfConfig wf) {
        this.wf = wf;
    }

    /**
     * Serialises this configuration to JSON.
     *
     * @throws PaceException if serialisation fails
     */
    @Override
    public String toString() {
        try {
            return MAPPER.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise configuration", e);
        }
    }

    @Override
    public Map<String, TreeNodeDef> decisionTree() {
        return getPace().getDecisionTree();
    }

    @Override
    public List<FieldDef> model() {
        return getPace().getModel();
    }

    @Override
    public List<ClusteringDef> clusterings() {
        return getPace().getClustering();
    }

    /** Returns the predicates built by {@link #load(String)}; {@code null} if this instance was not created through it. */
    @Override
    public Map<String, Predicate<String>> blacklists() {
        return blacklists;
    }

    // NOTE(review): value type assumed to be String; it is not visible in this file, so confirm
    // against Config#translationMap() and PaceConfig#translationMap().
    @Override
    public Map<String, String> translationMap() {
        return getPace().translationMap();
    }
}