imported cnr-data-utility-cleaner-rmi and cnr-data-utility-cleaner-service in dnet-data-services
This commit is contained in:
parent
22d126ffa8
commit
337b42e945
|
@ -28,6 +28,11 @@
|
|||
<artifactId>commons-beanutils</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.codehaus.groovy</groupId>
|
||||
<artifactId>groovy</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.json</groupId>
|
||||
<artifactId>json</artifactId>
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import javax.xml.ws.wsaddressing.W3CEndpointReference;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerService;
|
||||
import eu.dnetlib.enabling.resultset.MappedResultSetFactory;
|
||||
import eu.dnetlib.enabling.tools.AbstractBaseService;
|
||||
|
||||
public class CleanerServiceImpl extends AbstractBaseService implements CleanerService {
|
||||
|
||||
private CleaningRuleFactory cleaningRuleFactory;
|
||||
|
||||
private MappedResultSetFactory mappedResultSetFactory;
|
||||
|
||||
@Override
|
||||
public W3CEndpointReference clean(final W3CEndpointReference epr, final String ruleId) throws CleanerException {
|
||||
if ((ruleId == null) || (ruleId.isEmpty())) { throw new CleanerException("Invalid ruleId: id is empty"); }
|
||||
if (epr == null) { throw new CleanerException("Passed epr is empty"); }
|
||||
|
||||
return mappedResultSetFactory.createMappedResultSet(epr, cleaningRuleFactory.obtainCleaningRule(ruleId));
|
||||
}
|
||||
|
||||
@Required
|
||||
public MappedResultSetFactory getMappedResultSetFactory() {
|
||||
return mappedResultSetFactory;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setMappedResultSetFactory(final MappedResultSetFactory mappedResultSetFactory) {
|
||||
this.mappedResultSetFactory = mappedResultSetFactory;
|
||||
}
|
||||
|
||||
public CleaningRuleFactory getCleaningRuleFactory() {
|
||||
return cleaningRuleFactory;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setCleaningRuleFactory(final CleaningRuleFactory cleaningRuleFactory) {
|
||||
this.cleaningRuleFactory = cleaningRuleFactory;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Namespace;
|
||||
import org.dom4j.QName;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import eu.dnetlib.miscutils.functional.UnaryFunction;
|
||||
|
||||
public class CleaningRule implements UnaryFunction<String, String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(CleaningRule.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private List<XPATHCleaningRule> xpathRules = new ArrayList<XPATHCleaningRule>();
|
||||
|
||||
@Override
|
||||
public String evaluate(final String text) {
|
||||
|
||||
try {
|
||||
final List<Map<String, String>> errors = new ArrayList<Map<String, String>>();
|
||||
final Document doc = (new SAXReader()).read(new StringReader(text));
|
||||
for (final XPATHCleaningRule r : xpathRules) {
|
||||
errors.addAll(r.applyXpathRule(doc));
|
||||
}
|
||||
if (errors.size() > 0) {
|
||||
markAsInvalid(doc, errors);
|
||||
}
|
||||
return doc.asXML();
|
||||
} catch (final Exception e) {
|
||||
log.error("Error evaluating rule", e);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private void markAsInvalid(final Document doc, final List<Map<String, String>> errors) {
|
||||
final Element element = (Element) doc.selectSingleNode("//*[local-name()='header']");
|
||||
if (element != null) {
|
||||
final Element inv = element.addElement(new QName("invalid", new Namespace("dri", "http://www.driver-repository.eu/namespace/dri")));
|
||||
for (final Map<String, String> e : errors) {
|
||||
final Element err = inv.addElement(new QName("error", new Namespace("dri", "http://www.driver-repository.eu/namespace/dri")));
|
||||
for (final Map.Entry<String, String> entry : e.entrySet()) {
|
||||
err.addAttribute(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
inv.addAttribute("value", "true");
|
||||
}
|
||||
}
|
||||
|
||||
public List<XPATHCleaningRule> getXpathRules() {
|
||||
return xpathRules;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setXpathRules(final List<XPATHCleaningRule> xpathRules) {
|
||||
this.xpathRules = xpathRules;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.enabling.locators.UniqueServiceLocator;
|
||||
|
||||
public class CleaningRuleFactory {
|
||||
|
||||
private UniqueServiceLocator serviceLocator;
|
||||
|
||||
public CleaningRule obtainCleaningRule(final String ruleId) throws CleanerException {
|
||||
try {
|
||||
final String prof = serviceLocator.getService(ISLookUpService.class).getResourceProfileByQuery(
|
||||
"/RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value='" + ruleId + "' or .//CLEANER_NAME='" + ruleId + "']//CONFIGURATION");
|
||||
|
||||
final SAXReader reader = new SAXReader();
|
||||
final Document doc = reader.read(new StringReader(prof));
|
||||
|
||||
final CleaningRule rule = new CleaningRule();
|
||||
|
||||
final ISLookUpService lookup = serviceLocator.getService(ISLookUpService.class);
|
||||
|
||||
for (Object o : doc.selectNodes("//RULE")) {
|
||||
final Element node = (Element) o;
|
||||
|
||||
final String xpath = node.valueOf("@xpath");
|
||||
final String vocabularies = node.valueOf("@vocabularies");
|
||||
final String groovyRule = node.valueOf("@groovy");
|
||||
final String strict = node.valueOf("@strict");
|
||||
|
||||
final XPATHCleaningRule xpathRule;
|
||||
if (vocabularies != null && vocabularies.length() > 0) {
|
||||
final Set<String> list = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(vocabularies));
|
||||
xpathRule = new VocabularyRule(list, lookup);
|
||||
} else {
|
||||
xpathRule = new GroovyRule(groovyRule);
|
||||
}
|
||||
xpathRule.setXpath(xpath);
|
||||
xpathRule.setStrict("true".equals(strict));
|
||||
rule.getXpathRules().add(xpathRule);
|
||||
}
|
||||
return rule;
|
||||
} catch (Exception e) {
|
||||
throw new CleanerException("Error obtaing cleaner rule " + ruleId, e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getRuleIds() throws CleanerException {
|
||||
try {
|
||||
final HashSet<String> response = new HashSet<String>();
|
||||
|
||||
final List<String> list = serviceLocator.getService(ISLookUpService.class).quickSearchProfile("//CLEANER_NAME");
|
||||
if (list != null) {
|
||||
response.addAll(list);
|
||||
}
|
||||
|
||||
return Lists.newArrayList(response);
|
||||
} catch (ISLookUpException e) {
|
||||
throw new CleanerException("Error obtaining IDs of cleaner DSs", e);
|
||||
}
|
||||
}
|
||||
|
||||
public UniqueServiceLocator getServiceLocator() {
|
||||
return serviceLocator;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setServiceLocator(final UniqueServiceLocator serviceLocator) {
|
||||
this.serviceLocator = serviceLocator;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import groovy.lang.Closure;
|
||||
import groovy.lang.GroovyShell;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @author michele
|
||||
*
|
||||
* Groovy rules must be declared in a CleanerDS profile, some examples:
|
||||
*
|
||||
* <RULE xpath="..." groovy="(input =~ /X/).replaceAll('Y')" /> <RULE xpath="..." groovy="'CONSTANT'" /> <RULE xpath="..."
|
||||
* groovy="input.toUpperCase()" />
|
||||
*/
|
||||
|
||||
public class GroovyRule extends XPATHCleaningRule {
|
||||
|
||||
private static final Log log = LogFactory.getLog(GroovyRule.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private String groovyRule;
|
||||
private Closure<String> closure;
|
||||
|
||||
private GroovyShell groovyShell = new GroovyShell();
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public GroovyRule(final String groovyRule) {
|
||||
this.groovyRule = groovyRule;
|
||||
this.closure = (Closure<String>) groovyShell.evaluate("{ input -> " + groovyRule + "}");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String calculateNewValue(final String oldValue) throws CleanerException {
|
||||
try {
|
||||
log.info("Executing groovy closure on value " + oldValue);
|
||||
return closure.call(oldValue);
|
||||
} catch (Exception e) {
|
||||
log.error("Failed Groovy execution, groovyRule: " + groovyRule + ", input: " + oldValue, e);
|
||||
throw new CleanerException("Error executing groovy", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, String> verifyValue(final String value) throws CleanerException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "GROOVY: " + groovyRule;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
/**
|
||||
* @author michele
|
||||
*
|
||||
* Vocabulary rules must be declared in a CleanerDS profile, for each vocabulary must be present the relative VocabularyDS profile:
|
||||
*
|
||||
* <RULE xpath="..." vocabularies="VOC1" /> <RULE xpath="..." vocabularies="VOC1, VOC2, VOC3" />
|
||||
*/
|
||||
|
||||
public class VocabularyRule extends XPATHCleaningRule {
|
||||
|
||||
private Set<String> vocabularies;
|
||||
|
||||
private static final Log log = LogFactory.getLog(VocabularyRule.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private Map<String, String> synonyms = Maps.newHashMap();
|
||||
private Set<String> validTerms = Sets.newHashSet();
|
||||
|
||||
public VocabularyRule(final Set<String> vocabularies, final ISLookUpService lookup) throws CleanerException {
|
||||
this.vocabularies = vocabularies;
|
||||
|
||||
loadSynonymsAndTerms(lookup);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String calculateNewValue(final String oldValue) throws CleanerException {
|
||||
log.debug("calculating new value for: " + oldValue);
|
||||
|
||||
if (synonyms.isEmpty()) {
|
||||
log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies);
|
||||
}
|
||||
|
||||
String newValue = null;
|
||||
|
||||
if (synonyms.containsKey(oldValue.toLowerCase())) {
|
||||
newValue = synonyms.get(oldValue.toLowerCase());
|
||||
}
|
||||
|
||||
if (newValue == null) {
|
||||
log.debug("Synonym " + oldValue + " not found in vocabulary");
|
||||
return oldValue;
|
||||
}
|
||||
|
||||
return newValue;
|
||||
}
|
||||
|
||||
private void loadSynonymsAndTerms(final ISLookUpService lookup) throws CleanerException {
|
||||
|
||||
for (final String vocabulary : vocabularies) {
|
||||
try {
|
||||
final String query = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')"
|
||||
+ "//RESOURCE_PROFILE[.//VOCABULARY_NAME/@code='" + vocabulary + "']//TERM return "
|
||||
+ "( concat($x/@code,'|-:-|', $x/@code), concat($x/@english_name,'|-:-|', $x/@code), concat($x/@native_name,'|-:-|', $x/@code), "
|
||||
+ "for $y in $x//SYNONYM return concat($y/@term,'|-:-|', $x/@code) )";
|
||||
|
||||
for (final String s : lookup.quickSearchProfile(query)) {
|
||||
log.debug("SYNONYM : " + s);
|
||||
final String[] arr = s.split("\\|-:-\\|");
|
||||
if (arr[0] == null || arr[0].isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
synonyms.put(arr[0].toLowerCase(), arr[1]);
|
||||
validTerms.add(arr[1].toLowerCase());
|
||||
}
|
||||
|
||||
log.info("VOCABULARY " + vocabulary.trim() + " - terms size " + synonyms.size());
|
||||
} catch (final Exception e) {
|
||||
throw new CleanerException("Error obtaining vocabulary " + vocabulary, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, String> verifyValue(final String value) throws CleanerException {
|
||||
if (synonyms.isEmpty()) {
|
||||
log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies);
|
||||
}
|
||||
|
||||
if (validTerms.contains(value.toLowerCase())) { return null; }
|
||||
|
||||
final Map<String, String> error = new HashMap<String, String>();
|
||||
error.put("term", value);
|
||||
error.put("vocabularies", this.vocabularies.toString().replaceAll("\\[", "").replaceAll("\\]", ""));
|
||||
error.put("xpath", this.getXpath());
|
||||
return error;
|
||||
}
|
||||
|
||||
public Map<String, String> getVocabularyTerms() {
|
||||
return synonyms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "VOCABULARIES: [" + Joiner.on(", ").join(vocabularies) + "]";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
|
||||
public abstract class XPATHCleaningRule {
|
||||
|
||||
private String xpath;
|
||||
private boolean strict = false;
|
||||
|
||||
private static final Log logCleaningRules = LogFactory.getLog("VOCABULARY_RULES");
|
||||
|
||||
public List<Map<String, String>> applyXpathRule(final Document doc) throws CleanerException {
|
||||
final List<Map<String, String>> errors = Lists.newArrayList();
|
||||
|
||||
final String id = doc.valueOf("//*[local-name()='objIdentifier']");
|
||||
|
||||
for (Object o : doc.selectNodes(xpath)) {
|
||||
final Node node = (Node) o;
|
||||
|
||||
final String oldValue = node.getText().trim();
|
||||
|
||||
final String newValue = calculateNewValue(oldValue);
|
||||
if (strict) {
|
||||
final Map<String, String> err = verifyValue(newValue);
|
||||
if (err != null) {
|
||||
errors.add(err);
|
||||
|
||||
if (logCleaningRules.isInfoEnabled()) {
|
||||
logCleaningRules.info("[" + newValue + "] is INVALID, " + "RULE: " + toString() + ", " + "RECORD: " + id + ", " + "XPATH: "
|
||||
+ this.getXpath());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (logCleaningRules.isInfoEnabled() && !newValue.equals(oldValue)) {
|
||||
logCleaningRules.info("[" + oldValue + "] => [" + newValue + "], " + toString() + ", " + "RECORD: " + id + ", " + "XPATH: " + this.getXpath());
|
||||
}
|
||||
|
||||
node.setText(newValue);
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
protected abstract Map<String, String> verifyValue(final String value) throws CleanerException;
|
||||
|
||||
protected abstract String calculateNewValue(final String oldValue) throws CleanerException;
|
||||
|
||||
public String getXpath() {
|
||||
return xpath;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setXpath(final String xpath) {
|
||||
this.xpath = xpath;
|
||||
}
|
||||
|
||||
public boolean isStrict() {
|
||||
return strict;
|
||||
}
|
||||
|
||||
public void setStrict(final boolean strict) {
|
||||
this.strict = strict;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
package eu.dnetlib.data.utility.cleaner.inspector;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.ui.Model;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.CleaningRule;
|
||||
import eu.dnetlib.data.utility.cleaner.CleaningRuleFactory;
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import eu.dnetlib.enabling.inspector.AbstractInspectorController;
|
||||
import eu.dnetlib.miscutils.collections.MappedCollection;
|
||||
import eu.dnetlib.miscutils.functional.UnaryFunction;
|
||||
|
||||
@Controller
|
||||
public class CleanerInspector extends AbstractInspectorController {
|
||||
|
||||
@Resource
|
||||
private CleaningRuleFactory cleaningRuleFactory;
|
||||
|
||||
public static class SelectOption {
|
||||
|
||||
private String value;
|
||||
private boolean selected;
|
||||
|
||||
public SelectOption(final String value, final boolean selected) {
|
||||
super();
|
||||
this.value = value;
|
||||
this.selected = selected;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(final String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public boolean isSelected() {
|
||||
return selected;
|
||||
}
|
||||
|
||||
public void setSelected(final boolean selected) {
|
||||
this.selected = selected;
|
||||
}
|
||||
}
|
||||
|
||||
@RequestMapping(value = "/inspector/cleaner.do")
|
||||
public void cleaner(final Model model,
|
||||
@RequestParam(value = "rule", required = false) final String ruleId,
|
||||
@RequestParam(value = "dirtyText", required = false) final String dirtyText) throws CleanerException {
|
||||
|
||||
List<String> rules = Lists.newArrayList(cleaningRuleFactory.getRuleIds());
|
||||
model.addAttribute("rules", selectOptions(rules, ruleId));
|
||||
|
||||
if ((ruleId != null) && (dirtyText != null)) {
|
||||
CleaningRule rule = cleaningRuleFactory.obtainCleaningRule(ruleId);
|
||||
model.addAttribute("dirtyText", dirtyText);
|
||||
model.addAttribute("cleanedText", rule.evaluate(dirtyText));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an list of values, return a list of SelectOption instances which have the "selected" boolean field set to true only for the
|
||||
* element matching "current".
|
||||
*
|
||||
* @param input
|
||||
* list of input strings
|
||||
* @param current
|
||||
* current value to select
|
||||
* @return
|
||||
*/
|
||||
private List<SelectOption> selectOptions(final List<String> input, final String current) {
|
||||
final UnaryFunction<SelectOption, String> mapper = new UnaryFunction<SelectOption, String>() {
|
||||
|
||||
@Override
|
||||
public SelectOption evaluate(final String value) {
|
||||
return new SelectOption(value, value.equals(current));
|
||||
}
|
||||
};
|
||||
return Lists.newArrayList(new MappedCollection<SelectOption, String>(input, mapper));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.data.utility.cleaner.rmi;
|
||||
|
||||
import eu.dnetlib.common.rmi.RMIException;
|
||||
|
||||
public class CleanerException extends RMIException {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -7889315488590536918L;
|
||||
|
||||
public CleanerException(final Throwable e) {
|
||||
super(e);
|
||||
}
|
||||
|
||||
public CleanerException(final String message, final Throwable e) {
|
||||
super(message, e);
|
||||
}
|
||||
|
||||
public CleanerException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.data.utility.cleaner.rmi;
|
||||
|
||||
import javax.jws.WebService;
|
||||
import javax.xml.ws.wsaddressing.W3CEndpointReference;
|
||||
|
||||
import eu.dnetlib.common.rmi.BaseService;
|
||||
|
||||
/**
|
||||
* @author michele
|
||||
*
|
||||
*/
|
||||
@WebService(targetNamespace = "http://services.dnetlib.eu/")
|
||||
public interface CleanerService extends BaseService {
|
||||
|
||||
/**
|
||||
* @param epr
|
||||
* an epr of a resultset with dirty records
|
||||
* @param ruleId
|
||||
* the identifier of a rule
|
||||
* @return an epr of a resultset with clean records
|
||||
* @throws CleanerException
|
||||
*/
|
||||
W3CEndpointReference clean(final W3CEndpointReference epr, final String ruleId) throws CleanerException;
|
||||
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
service.cleaner.mapped.resultset.factory=mappedResultSetFactory
|
|
@ -0,0 +1,36 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<beans xmlns="http://www.springframework.org/schema/beans"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:jaxws="http://cxf.apache.org/jaxws"
|
||||
xmlns:sec="http://cxf.apache.org/configuration/security" xmlns:wsa="http://cxf.apache.org/ws/addressing"
|
||||
xmlns:p="http://www.springframework.org/schema/p" xmlns:http="http://cxf.apache.org/transports/http/configuration"
|
||||
xmlns:t="http://dnetlib.eu/springbeans/t" xmlns:template="http://dnetlib.eu/springbeans/template"
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
||||
http://cxf.apache.org/ws/addressing http://cxf.apache.org/schemas/ws-addr-conf.xsd
|
||||
http://cxf.apache.org/configuration/security http://cxf.apache.org/schemas/configuration/security.xsd
|
||||
http://cxf.apache.org/transports/http/configuration http://cxf.apache.org/schemas/configuration/http-conf.xsd
|
||||
http://cxf.apache.org/jaxws http://cxf.apache.org/schemas/jaxws.xsd
|
||||
http://dnetlib.eu/springbeans/template http://dnetlib.eu/springbeans/template.xsd">
|
||||
|
||||
<!-- beans -->
|
||||
<bean id="cleanerService"
|
||||
class="eu.dnetlib.data.utility.cleaner.CleanerServiceImpl"
|
||||
init-method="start" destroy-method="stop" p:cleaningRuleFactory-ref="cleaningRuleFactory"
|
||||
p:mappedResultSetFactory-ref="${service.cleaner.mapped.resultset.factory}" />
|
||||
|
||||
<bean id="cleaningRuleFactory"
|
||||
class="eu.dnetlib.data.utility.cleaner.CleaningRuleFactory"
|
||||
p:serviceLocator-ref="uniqueServiceLocator">
|
||||
</bean>
|
||||
|
||||
<!-- endpoints -->
|
||||
<jaxws:endpoint id="cleanerServiceEndpoint" implementor="#cleanerService"
|
||||
implementorClass="eu.dnetlib.data.utility.cleaner.rmi.CleanerService"
|
||||
address="/cleaner" />
|
||||
|
||||
<template:instance name="serviceRegistrationManager"
|
||||
t:serviceRegistrationManagerClass="eu.dnetlib.enabling.tools.registration.ValidatingServiceRegistrationManagerImpl"
|
||||
t:name="cleanerServiceRegistrationManager" t:service="cleanerService"
|
||||
t:endpoint="cleanerServiceEndpoint" t:jobScheduler="jobScheduler"
|
||||
t:serviceRegistrator="blackboardServiceRegistrator" />
|
||||
|
||||
</beans>
|
|
@ -0,0 +1,19 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<beans xmlns="http://www.springframework.org/schema/beans"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:p="http://www.springframework.org/schema/p"
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">
|
||||
|
||||
<bean id="cleanerInspectorGroup"
|
||||
class="eu.dnetlib.enabling.inspector.StaticEntryPointDescriptorGroup"
|
||||
p:name="cleaner">
|
||||
<property name="descriptors">
|
||||
<list>
|
||||
<bean class="eu.dnetlib.enabling.inspector.StaticEntryPointDescriptor"
|
||||
p:name="cleaner" p:relativeUrl="cleaner.do"
|
||||
p:hiddenAsDefault="true"/>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
</beans>
|
|
@ -0,0 +1,36 @@
|
|||
$inspector/master(it={
|
||||
|
||||
<style type="text/css">
|
||||
#results {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
#results td:first-child {
|
||||
width: 2em;
|
||||
}
|
||||
|
||||
#results td {
|
||||
border: 1px solid #cecece;
|
||||
}
|
||||
</style>
|
||||
|
||||
<h2>Cleaner</h2>
|
||||
|
||||
<form method="POST">
|
||||
Cleaner rules:
|
||||
<select name="rule">
|
||||
$rules:{<option $if(it.selected)$selected$endif$>$it.value$</option>}$
|
||||
</select><br /><br />
|
||||
|
||||
Dirty Record:<br />
|
||||
<textarea name="dirtyText" cols="80" rows="10">$dirtyText$</textarea>
|
||||
<br /><br />
|
||||
<input type="submit" value="submit"/>
|
||||
</form>
|
||||
<br />
|
||||
|
||||
Cleaned Record:<br />
|
||||
<textarea readonly="readonly" cols="80" rows="10">$cleanedText$</textarea>
|
||||
|
||||
|
||||
})$
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import javax.xml.ws.wsaddressing.W3CEndpointReference;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.runners.MockitoJUnit44Runner;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
import eu.dnetlib.enabling.resultset.MappedResultSetFactory;
|
||||
import eu.dnetlib.test.utils.EPRTestUtil;
|
||||
|
||||
@RunWith(MockitoJUnit44Runner.class)
|
||||
public class CleanerServiceImplTest {
|
||||
|
||||
/**
|
||||
* Class under test.
|
||||
*/
|
||||
private CleanerServiceImpl service;
|
||||
|
||||
@Mock
|
||||
private CleaningRuleFactory cleaningRuleFactory;
|
||||
@Mock
|
||||
private MappedResultSetFactory mappedResultSetFactory;
|
||||
@Mock
|
||||
private CleaningRule cleaningRule;
|
||||
|
||||
private W3CEndpointReference epr_IN = EPRTestUtil.getTestEpr("http://1");
|
||||
private W3CEndpointReference epr_OUT = EPRTestUtil.getTestEpr("http://2");
|
||||
|
||||
private static final String RULE_ID = "RULE_01";
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
when(cleaningRuleFactory.obtainCleaningRule(RULE_ID)).thenReturn(cleaningRule);
|
||||
when(mappedResultSetFactory.createMappedResultSet(epr_IN, cleaningRule)).thenReturn(epr_OUT);
|
||||
|
||||
service = new CleanerServiceImpl();
|
||||
service.setCleaningRuleFactory(cleaningRuleFactory);
|
||||
service.setMappedResultSetFactory(mappedResultSetFactory);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClean() throws CleanerException {
|
||||
W3CEndpointReference epr = service.clean(epr_IN, RULE_ID);
|
||||
assertNotNull(epr);
|
||||
assertEquals(epr_OUT, epr);
|
||||
verify(cleaningRuleFactory).obtainCleaningRule(RULE_ID);
|
||||
}
|
||||
|
||||
@Test(expected = CleanerException.class)
|
||||
public void testClean_null_1() throws CleanerException {
|
||||
service.clean(epr_IN, null);
|
||||
}
|
||||
|
||||
@Test(expected = CleanerException.class)
|
||||
public void testClean_null_2() throws CleanerException {
|
||||
service.clean(null, RULE_ID);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,135 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.runners.MockitoJUnit44Runner;
|
||||
|
||||
@RunWith(MockitoJUnit44Runner.class)
|
||||
public class GroovyRuleTest {
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_simple_constant() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("'YYYY'");
|
||||
|
||||
final String xpath = "/a/b";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b>XXXX</b></a>");
|
||||
|
||||
assertEquals("YYYY", doc.valueOf(xpath));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_simple_regex() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("(input =~ /X/).replaceAll('Y')");
|
||||
|
||||
final String xpath = "/a/b";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b>aXaXa</b></a>");
|
||||
|
||||
assertEquals("aYaYa", doc.valueOf(xpath));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_simple_upper() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("input.toUpperCase()");
|
||||
|
||||
final String xpath = "/a/b";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b>xyz</b></a>");
|
||||
|
||||
assertEquals("XYZ", doc.valueOf(xpath));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_multi() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("'Y'");
|
||||
|
||||
final String xpath = "/a/b";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b>X</b><b>X</b><b>X</b></a>");
|
||||
|
||||
List<?> list = doc.selectNodes(xpath);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
for (Object o : list) {
|
||||
assertEquals("Y", ((Node) o).getText());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_singleAttr() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("'BBBB'");
|
||||
|
||||
final String xpath = "/a/b/@value";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b value='AAAA'>XXXX</b></a>");
|
||||
|
||||
assertEquals("BBBB", doc.valueOf(xpath));
|
||||
assertEquals("XXXX", doc.valueOf("/a/b"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_multiAttr() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("'B'");
|
||||
|
||||
final String xpath = "/a/b/@value";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b value='a' /><b value='b' /><b value='c' /></a>");
|
||||
|
||||
final List<?> list = doc.selectNodes(xpath);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
for (Object o : list) {
|
||||
assertEquals("B", ((Node) o).getText());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_complex() throws Exception {
|
||||
final GroovyRule rule = new GroovyRule("'B'");
|
||||
|
||||
final String xpath = "/a/b";
|
||||
|
||||
rule.setXpath(xpath);
|
||||
|
||||
final Document doc = execute(rule, "<a><b>X<c>C</c></b></a>");
|
||||
|
||||
assertTrue(doc.valueOf(xpath).contains("B"));
|
||||
assertEquals("C", doc.valueOf("/a/b/c"));
|
||||
}
|
||||
|
||||
private Document execute(final GroovyRule rule, final String xml) throws Exception {
|
||||
|
||||
final SAXReader reader = new SAXReader();
|
||||
final Document doc = reader.read(new StringReader(xml));
|
||||
|
||||
System.out.println("BEFORE:\n" + doc.asXML() + "\n");
|
||||
rule.applyXpathRule(doc);
|
||||
System.out.println("AFTER:\n" + doc.asXML() + "\n");
|
||||
|
||||
System.out.println("-----------------------------\n");
|
||||
return doc;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,128 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.mockito.Matchers.anyString;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.runners.MockitoJUnit44Runner;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@RunWith(MockitoJUnit44Runner.class)
|
||||
public class VocabularyRuleTest {
|
||||
|
||||
private static final String VOCABULARY_NAME_1 = "TEST VOCABULARY 1";
|
||||
private static final String VOCABULARY_NAME_2 = "TEST VOCABULARY 2";
|
||||
private static final List<String> VOCABULARY = Lists.newArrayList("XXXX|-:-|AAAA", "YYYY|-:-|AAAA", "ZZZZ|-:-|AAAA");
|
||||
|
||||
/**
|
||||
* Class Under Test
|
||||
*/
|
||||
private VocabularyRule rule;
|
||||
|
||||
@Mock
|
||||
private ISLookUpService lookup;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
when(lookup.quickSearchProfile(anyString())).thenReturn(VOCABULARY);
|
||||
|
||||
rule = new VocabularyRule(Sets.newHashSet(VOCABULARY_NAME_1, VOCABULARY_NAME_2), lookup);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetup() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
|
||||
execute("<a><b>XXXX</b></a>");
|
||||
|
||||
verify(lookup, times(2)).quickSearchProfile(anyString());
|
||||
assertEquals(VOCABULARY.size(), rule.getVocabularyTerms().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b>XXXX</b></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
assertNull(rule.verifyValue("AAAA"));
|
||||
assertNotNull(rule.verifyValue("XXXX"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_2() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b>XXXX</b></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
assertNull(rule.verifyValue("AAAA"));
|
||||
assertNotNull(rule.verifyValue("XXXX"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_3() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b>XXXX</b></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_attr() throws Exception {
|
||||
final String xpath = "/a/b/@value";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b value='XXXX' /></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
assertNull(rule.verifyValue("AAAA"));
|
||||
assertNotNull(rule.verifyValue("XXXX"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_with_spaces() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b> XXXX </b></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
assertNull(rule.verifyValue("AAAA"));
|
||||
assertNotNull(rule.verifyValue(" XXXX "));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyXpathRule_case() throws Exception {
|
||||
final String xpath = "/a/b";
|
||||
rule.setXpath(xpath);
|
||||
final Document doc = execute("<a><b>Xxxx</b></a>");
|
||||
assertEquals("AAAA", doc.valueOf(xpath));
|
||||
assertNull(rule.verifyValue("AAAA"));
|
||||
assertNotNull(rule.verifyValue("Xxxx"));
|
||||
}
|
||||
|
||||
private Document execute(final String xml) throws Exception {
|
||||
final SAXReader reader = new SAXReader();
|
||||
final Document doc = reader.read(new StringReader(xml));
|
||||
System.out.println("BEFORE:\n" + doc.asXML() + "\n");
|
||||
rule.applyXpathRule(doc);
|
||||
System.out.println("AFTER:\n" + doc.asXML() + "\n");
|
||||
System.out.println("-----------------------------\n");
|
||||
return doc;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package eu.dnetlib.data.utility.cleaner;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.runners.MockitoJUnit44Runner;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
|
||||
|
||||
@RunWith(MockitoJUnit44Runner.class)
|
||||
public class XMLCleaningRuleTest {
|
||||
|
||||
private static final String INPUT_VALID = "<record>" + " <header />" + " <metadata>" + " <a>HELLO</a>" + " </metadata>" + "</record>";
|
||||
|
||||
private static final String INPUT_INVALID = "<record>" + " <header />" + " <metadata>" + " <a>GOOD BYE</a>" + " </metadata>" + "</record>";
|
||||
|
||||
/**
|
||||
* Class under test.
|
||||
*/
|
||||
private CleaningRule xmlRule;
|
||||
private XPATHCleaningRule mockXpathRule = new XPATHCleaningRule() {
|
||||
|
||||
@Override
|
||||
protected Map<String, String> verifyValue(final String value) throws CleanerException {
|
||||
if (value.equals("CIAO")) { return null; }
|
||||
|
||||
Map<String, String> err = new HashMap<String, String>();
|
||||
err.put("term", value);
|
||||
return err;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String calculateNewValue(final String oldValue) throws CleanerException {
|
||||
if (oldValue.equals("HELLO")) { return "CIAO"; }
|
||||
return oldValue;
|
||||
}
|
||||
};;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
xmlRule = new CleaningRule();
|
||||
|
||||
mockXpathRule.setStrict(true);
|
||||
mockXpathRule.setXpath("//a");
|
||||
|
||||
xmlRule.setXpathRules(Lists.newArrayList(mockXpathRule));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvaluate_valid() {
|
||||
String s = xmlRule.evaluate(INPUT_VALID);
|
||||
assertTrue(s.contains("CIAO"));
|
||||
assertFalse(s.contains("invalid"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvaluate_invalid() {
|
||||
String s = xmlRule.evaluate(INPUT_INVALID);
|
||||
System.out.println(s);
|
||||
assertFalse(s.contains("CIAO"));
|
||||
assertTrue(s.contains("invalid"));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue