From 337b42e94593d4e9a7d0095a5d2b0dfe4066fea2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Jun 2019 17:58:17 +0200 Subject: [PATCH] imported cnr-data-utility-cleaner-rmi and cnr-data-utility-cleaner-service in dnet-data-services --- dnet-data-services/pom.xml | 5 + .../utility/cleaner/CleanerServiceImpl.java | 45 ++++++ .../data/utility/cleaner/CleaningRule.java | 67 +++++++++ .../utility/cleaner/CleaningRuleFactory.java | 86 +++++++++++ .../data/utility/cleaner/GroovyRule.java | 56 ++++++++ .../data/utility/cleaner/VocabularyRule.java | 113 +++++++++++++++ .../utility/cleaner/XPATHCleaningRule.java | 77 ++++++++++ .../cleaner/inspector/CleanerInspector.java | 91 ++++++++++++ .../utility/cleaner/rmi/CleanerException.java | 24 ++++ .../utility/cleaner/rmi/CleanerService.java | 25 ++++ .../applicationContext-cleaner.properties | 1 + .../cleaner/applicationContext-cleaner.xml | 36 +++++ .../webContext-cleaner-inspector.xml | 19 +++ .../enabling/views/inspector/cleaner.st | 36 +++++ .../cleaner/CleanerServiceImplTest.java | 67 +++++++++ .../data/utility/cleaner/GroovyRuleTest.java | 135 ++++++++++++++++++ .../utility/cleaner/VocabularyRuleTest.java | 128 +++++++++++++++++ .../utility/cleaner/XMLCleaningRuleTest.java | 72 ++++++++++ pom.xml | 7 + 19 files changed, 1090 insertions(+) create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleanerServiceImpl.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRuleFactory.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/GroovyRule.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/XPATHCleaningRule.java create mode 100644 
dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/inspector/CleanerInspector.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerException.java create mode 100644 dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerService.java create mode 100644 dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.properties create mode 100644 dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.xml create mode 100644 dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/inspector/webContext-cleaner-inspector.xml create mode 100644 dnet-data-services/src/main/resources/eu/dnetlib/enabling/views/inspector/cleaner.st create mode 100644 dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/CleanerServiceImplTest.java create mode 100644 dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java create mode 100644 dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java create mode 100644 dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java diff --git a/dnet-data-services/pom.xml b/dnet-data-services/pom.xml index 2d13584..405b776 100644 --- a/dnet-data-services/pom.xml +++ b/dnet-data-services/pom.xml @@ -28,6 +28,11 @@ commons-beanutils + + org.codehaus.groovy + groovy + + org.json json diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleanerServiceImpl.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleanerServiceImpl.java new file mode 100644 index 0000000..06f2a24 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleanerServiceImpl.java @@ -0,0 +1,45 @@ +package eu.dnetlib.data.utility.cleaner; + +import javax.xml.ws.wsaddressing.W3CEndpointReference; + +import 
org.springframework.beans.factory.annotation.Required; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import eu.dnetlib.data.utility.cleaner.rmi.CleanerService; +import eu.dnetlib.enabling.resultset.MappedResultSetFactory; +import eu.dnetlib.enabling.tools.AbstractBaseService; + +public class CleanerServiceImpl extends AbstractBaseService implements CleanerService { + + private CleaningRuleFactory cleaningRuleFactory; + + private MappedResultSetFactory mappedResultSetFactory; + + @Override + public W3CEndpointReference clean(final W3CEndpointReference epr, final String ruleId) throws CleanerException { + if ((ruleId == null) || (ruleId.isEmpty())) { throw new CleanerException("Invalid ruleId: id is empty"); } + if (epr == null) { throw new CleanerException("Passed epr is empty"); } + + return mappedResultSetFactory.createMappedResultSet(epr, cleaningRuleFactory.obtainCleaningRule(ruleId)); + } + + @Required + public MappedResultSetFactory getMappedResultSetFactory() { + return mappedResultSetFactory; + } + + @Required + public void setMappedResultSetFactory(final MappedResultSetFactory mappedResultSetFactory) { + this.mappedResultSetFactory = mappedResultSetFactory; + } + + public CleaningRuleFactory getCleaningRuleFactory() { + return cleaningRuleFactory; + } + + @Required + public void setCleaningRuleFactory(final CleaningRuleFactory cleaningRuleFactory) { + this.cleaningRuleFactory = cleaningRuleFactory; + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java new file mode 100644 index 0000000..a46c9ef --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java @@ -0,0 +1,67 @@ +package eu.dnetlib.data.utility.cleaner; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; 
+import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.Namespace; +import org.dom4j.QName; +import org.dom4j.io.SAXReader; +import org.springframework.beans.factory.annotation.Required; + +import eu.dnetlib.miscutils.functional.UnaryFunction; + +public class CleaningRule implements UnaryFunction { + + private static final Log log = LogFactory.getLog(CleaningRule.class); // NOPMD by marko on 11/24/08 5:02 PM + + private List xpathRules = new ArrayList(); + + @Override + public String evaluate(final String text) { + + try { + final List> errors = new ArrayList>(); + final Document doc = (new SAXReader()).read(new StringReader(text)); + for (final XPATHCleaningRule r : xpathRules) { + errors.addAll(r.applyXpathRule(doc)); + } + if (errors.size() > 0) { + markAsInvalid(doc, errors); + } + return doc.asXML(); + } catch (final Exception e) { + log.error("Error evaluating rule", e); + } + return ""; + } + + private void markAsInvalid(final Document doc, final List> errors) { + final Element element = (Element) doc.selectSingleNode("//*[local-name()='header']"); + if (element != null) { + final Element inv = element.addElement(new QName("invalid", new Namespace("dri", "http://www.driver-repository.eu/namespace/dri"))); + for (final Map e : errors) { + final Element err = inv.addElement(new QName("error", new Namespace("dri", "http://www.driver-repository.eu/namespace/dri"))); + for (final Map.Entry entry : e.entrySet()) { + err.addAttribute(entry.getKey(), entry.getValue()); + } + } + inv.addAttribute("value", "true"); + } + } + + public List getXpathRules() { + return xpathRules; + } + + @Required + public void setXpathRules(final List xpathRules) { + this.xpathRules = xpathRules; + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRuleFactory.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRuleFactory.java new file mode 100644 index 
0000000..21df4d5 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRuleFactory.java @@ -0,0 +1,86 @@ +package eu.dnetlib.data.utility.cleaner; + +import java.io.StringReader; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.springframework.beans.factory.annotation.Required; + +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.enabling.locators.UniqueServiceLocator; + +public class CleaningRuleFactory { + + private UniqueServiceLocator serviceLocator; + + public CleaningRule obtainCleaningRule(final String ruleId) throws CleanerException { + try { + final String prof = serviceLocator.getService(ISLookUpService.class).getResourceProfileByQuery( + "/RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value='" + ruleId + "' or .//CLEANER_NAME='" + ruleId + "']//CONFIGURATION"); + + final SAXReader reader = new SAXReader(); + final Document doc = reader.read(new StringReader(prof)); + + final CleaningRule rule = new CleaningRule(); + + final ISLookUpService lookup = serviceLocator.getService(ISLookUpService.class); + + for (Object o : doc.selectNodes("//RULE")) { + final Element node = (Element) o; + + final String xpath = node.valueOf("@xpath"); + final String vocabularies = node.valueOf("@vocabularies"); + final String groovyRule = node.valueOf("@groovy"); + final String strict = node.valueOf("@strict"); + + final XPATHCleaningRule xpathRule; + if (vocabularies != null && vocabularies.length() > 0) { + final Set list = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(vocabularies)); + xpathRule = new VocabularyRule(list, 
lookup); + } else { + xpathRule = new GroovyRule(groovyRule); + } + xpathRule.setXpath(xpath); + xpathRule.setStrict("true".equals(strict)); + rule.getXpathRules().add(xpathRule); + } + return rule; + } catch (Exception e) { + throw new CleanerException("Error obtaing cleaner rule " + ruleId, e); + } + } + + public List getRuleIds() throws CleanerException { + try { + final HashSet response = new HashSet(); + + final List list = serviceLocator.getService(ISLookUpService.class).quickSearchProfile("//CLEANER_NAME"); + if (list != null) { + response.addAll(list); + } + + return Lists.newArrayList(response); + } catch (ISLookUpException e) { + throw new CleanerException("Error obtaining IDs of cleaner DSs", e); + } + } + + public UniqueServiceLocator getServiceLocator() { + return serviceLocator; + } + + @Required + public void setServiceLocator(final UniqueServiceLocator serviceLocator) { + this.serviceLocator = serviceLocator; + } +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/GroovyRule.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/GroovyRule.java new file mode 100644 index 0000000..5defb35 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/GroovyRule.java @@ -0,0 +1,56 @@ +package eu.dnetlib.data.utility.cleaner; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import groovy.lang.Closure; +import groovy.lang.GroovyShell; + +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * @author michele + * + * Groovy rules must be declared in a CleanerDS profile, some examples: + * + * + */ + +public class GroovyRule extends XPATHCleaningRule { + + private static final Log log = LogFactory.getLog(GroovyRule.class); // NOPMD by marko on 11/24/08 5:02 PM + + private String groovyRule; + private Closure closure; + + private GroovyShell groovyShell = new GroovyShell(); + + @SuppressWarnings("unchecked") 
+ public GroovyRule(final String groovyRule) { + this.groovyRule = groovyRule; + this.closure = (Closure) groovyShell.evaluate("{ input -> " + groovyRule + "}"); + } + + @Override + protected String calculateNewValue(final String oldValue) throws CleanerException { + try { + log.info("Executing groovy closure on value " + oldValue); + return closure.call(oldValue); + } catch (Exception e) { + log.error("Failed Groovy execution, groovyRule: " + groovyRule + ", input: " + oldValue, e); + throw new CleanerException("Error executing groovy", e); + } + } + + @Override + protected Map verifyValue(final String value) throws CleanerException { + return null; + } + + @Override + public String toString() { + return "GROOVY: " + groovyRule; + } +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java new file mode 100644 index 0000000..750bac1 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java @@ -0,0 +1,113 @@ +package eu.dnetlib.data.utility.cleaner; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.base.Joiner; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +/** + * @author michele + * + * Vocabulary rules must be declared in a CleanerDS profile, for each vocabulary must be present the relative VocabularyDS profile: + * + * + */ + +public class VocabularyRule extends XPATHCleaningRule { + + private Set vocabularies; + + private static final Log log = LogFactory.getLog(VocabularyRule.class); // NOPMD by marko on 11/24/08 5:02 PM + + private Map synonyms = Maps.newHashMap(); + private Set validTerms 
= Sets.newHashSet(); + + public VocabularyRule(final Set vocabularies, final ISLookUpService lookup) throws CleanerException { + this.vocabularies = vocabularies; + + loadSynonymsAndTerms(lookup); + } + + @Override + protected String calculateNewValue(final String oldValue) throws CleanerException { + log.debug("calculating new value for: " + oldValue); + + if (synonyms.isEmpty()) { + log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies); + } + + String newValue = null; + + if (synonyms.containsKey(oldValue.toLowerCase())) { + newValue = synonyms.get(oldValue.toLowerCase()); + } + + if (newValue == null) { + log.debug("Synonym " + oldValue + " not found in vocabulary"); + return oldValue; + } + + return newValue; + } + + private void loadSynonymsAndTerms(final ISLookUpService lookup) throws CleanerException { + + for (final String vocabulary : vocabularies) { + try { + final String query = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')" + + "//RESOURCE_PROFILE[.//VOCABULARY_NAME/@code='" + vocabulary + "']//TERM return " + + "( concat($x/@code,'|-:-|', $x/@code), concat($x/@english_name,'|-:-|', $x/@code), concat($x/@native_name,'|-:-|', $x/@code), " + + "for $y in $x//SYNONYM return concat($y/@term,'|-:-|', $x/@code) )"; + + for (final String s : lookup.quickSearchProfile(query)) { + log.debug("SYNONYM : " + s); + final String[] arr = s.split("\\|-:-\\|"); + if (arr[0] == null || arr[0].isEmpty()) { + continue; + } + synonyms.put(arr[0].toLowerCase(), arr[1]); + validTerms.add(arr[1].toLowerCase()); + } + + log.info("VOCABULARY " + vocabulary.trim() + " - terms size " + synonyms.size()); + } catch (final Exception e) { + throw new CleanerException("Error obtaining vocabulary " + vocabulary, e); + } + } + + } + + @Override + protected Map verifyValue(final String value) throws CleanerException { + if (synonyms.isEmpty()) { + log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies); + } + + if 
(validTerms.contains(value.toLowerCase())) { return null; } + + final Map error = new HashMap(); + error.put("term", value); + error.put("vocabularies", this.vocabularies.toString().replaceAll("\\[", "").replaceAll("\\]", "")); + error.put("xpath", this.getXpath()); + return error; + } + + public Map getVocabularyTerms() { + return synonyms; + } + + @Override + public String toString() { + return "VOCABULARIES: [" + Joiner.on(", ").join(vocabularies) + "]"; + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/XPATHCleaningRule.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/XPATHCleaningRule.java new file mode 100644 index 0000000..9968b9b --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/XPATHCleaningRule.java @@ -0,0 +1,77 @@ +package eu.dnetlib.data.utility.cleaner; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; +import org.dom4j.Node; +import org.springframework.beans.factory.annotation.Required; + +import com.google.common.collect.Lists; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; + +public abstract class XPATHCleaningRule { + + private String xpath; + private boolean strict = false; + + private static final Log logCleaningRules = LogFactory.getLog("VOCABULARY_RULES"); + + public List> applyXpathRule(final Document doc) throws CleanerException { + final List> errors = Lists.newArrayList(); + + final String id = doc.valueOf("//*[local-name()='objIdentifier']"); + + for (Object o : doc.selectNodes(xpath)) { + final Node node = (Node) o; + + final String oldValue = node.getText().trim(); + + final String newValue = calculateNewValue(oldValue); + if (strict) { + final Map err = verifyValue(newValue); + if (err != null) { + errors.add(err); + + if (logCleaningRules.isInfoEnabled()) { + logCleaningRules.info("[" + newValue + "] is 
INVALID, " + "RULE: " + toString() + ", " + "RECORD: " + id + ", " + "XPATH: " + + this.getXpath()); + } + } + } + + if (logCleaningRules.isInfoEnabled() && !newValue.equals(oldValue)) { + logCleaningRules.info("[" + oldValue + "] => [" + newValue + "], " + toString() + ", " + "RECORD: " + id + ", " + "XPATH: " + this.getXpath()); + } + + node.setText(newValue); + } + + return errors; + } + + protected abstract Map verifyValue(final String value) throws CleanerException; + + protected abstract String calculateNewValue(final String oldValue) throws CleanerException; + + public String getXpath() { + return xpath; + } + + @Required + public void setXpath(final String xpath) { + this.xpath = xpath; + } + + public boolean isStrict() { + return strict; + } + + public void setStrict(final boolean strict) { + this.strict = strict; + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/inspector/CleanerInspector.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/inspector/CleanerInspector.java new file mode 100644 index 0000000..2abbb66 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/inspector/CleanerInspector.java @@ -0,0 +1,91 @@ +package eu.dnetlib.data.utility.cleaner.inspector; + +import java.util.List; + +import javax.annotation.Resource; + +import org.springframework.stereotype.Controller; +import org.springframework.ui.Model; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; + +import com.google.common.collect.Lists; + +import eu.dnetlib.data.utility.cleaner.CleaningRule; +import eu.dnetlib.data.utility.cleaner.CleaningRuleFactory; +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import eu.dnetlib.enabling.inspector.AbstractInspectorController; +import eu.dnetlib.miscutils.collections.MappedCollection; +import eu.dnetlib.miscutils.functional.UnaryFunction; + +@Controller +public class 
CleanerInspector extends AbstractInspectorController { + + @Resource + private CleaningRuleFactory cleaningRuleFactory; + + public static class SelectOption { + + private String value; + private boolean selected; + + public SelectOption(final String value, final boolean selected) { + super(); + this.value = value; + this.selected = selected; + } + + public String getValue() { + return value; + } + + public void setValue(final String value) { + this.value = value; + } + + public boolean isSelected() { + return selected; + } + + public void setSelected(final boolean selected) { + this.selected = selected; + } + } + + @RequestMapping(value = "/inspector/cleaner.do") + public void cleaner(final Model model, + @RequestParam(value = "rule", required = false) final String ruleId, + @RequestParam(value = "dirtyText", required = false) final String dirtyText) throws CleanerException { + + List rules = Lists.newArrayList(cleaningRuleFactory.getRuleIds()); + model.addAttribute("rules", selectOptions(rules, ruleId)); + + if ((ruleId != null) && (dirtyText != null)) { + CleaningRule rule = cleaningRuleFactory.obtainCleaningRule(ruleId); + model.addAttribute("dirtyText", dirtyText); + model.addAttribute("cleanedText", rule.evaluate(dirtyText)); + } + } + + /** + * Given a list of values, return a list of SelectOption instances which have the "selected" boolean field set to true only for the + * element matching "current". 
+ * + * @param input + * list of input strings + * @param current + * current value to select + * @return + */ + private List selectOptions(final List input, final String current) { + final UnaryFunction mapper = new UnaryFunction() { + + @Override + public SelectOption evaluate(final String value) { + return new SelectOption(value, value.equals(current)); + } + }; + return Lists.newArrayList(new MappedCollection(input, mapper)); + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerException.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerException.java new file mode 100644 index 0000000..54e93f6 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerException.java @@ -0,0 +1,24 @@ +package eu.dnetlib.data.utility.cleaner.rmi; + +import eu.dnetlib.common.rmi.RMIException; + +public class CleanerException extends RMIException { + + /** + * + */ + private static final long serialVersionUID = -7889315488590536918L; + + public CleanerException(final Throwable e) { + super(e); + } + + public CleanerException(final String message, final Throwable e) { + super(message, e); + } + + public CleanerException(final String message) { + super(message); + } + +} diff --git a/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerService.java b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerService.java new file mode 100644 index 0000000..9c48ea2 --- /dev/null +++ b/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/rmi/CleanerService.java @@ -0,0 +1,25 @@ +package eu.dnetlib.data.utility.cleaner.rmi; + +import javax.jws.WebService; +import javax.xml.ws.wsaddressing.W3CEndpointReference; + +import eu.dnetlib.common.rmi.BaseService; + +/** + * @author michele + * + */ +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface CleanerService extends BaseService { + + /** + * 
@param epr + * an epr of a resultset with dirty records + * @param ruleId + * the identifier of a rule + * @return an epr of a resultset with clean records + * @throws CleanerException + */ + W3CEndpointReference clean(final W3CEndpointReference epr, final String ruleId) throws CleanerException; + +} diff --git a/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.properties b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.properties new file mode 100644 index 0000000..5bca525 --- /dev/null +++ b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.properties @@ -0,0 +1 @@ +service.cleaner.mapped.resultset.factory=mappedResultSetFactory \ No newline at end of file diff --git a/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.xml b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.xml new file mode 100644 index 0000000..6ec6216 --- /dev/null +++ b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/applicationContext-cleaner.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + diff --git a/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/inspector/webContext-cleaner-inspector.xml b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/inspector/webContext-cleaner-inspector.xml new file mode 100644 index 0000000..88b3abc --- /dev/null +++ b/dnet-data-services/src/main/resources/eu/dnetlib/data/cleaner/inspector/webContext-cleaner-inspector.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + diff --git a/dnet-data-services/src/main/resources/eu/dnetlib/enabling/views/inspector/cleaner.st b/dnet-data-services/src/main/resources/eu/dnetlib/enabling/views/inspector/cleaner.st new file mode 100644 index 0000000..f8ec69c --- /dev/null +++ b/dnet-data-services/src/main/resources/eu/dnetlib/enabling/views/inspector/cleaner.st @@ -0,0 +1,36 @@ 
+$inspector/master(it={ + + + +

Browse indices

+ +
+Cleaner rules: +

+ +Dirty Record:
+ +

+ +
+
+ +Cleaned Record:
+ + + +})$ \ No newline at end of file diff --git a/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/CleanerServiceImplTest.java b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/CleanerServiceImplTest.java new file mode 100644 index 0000000..c722b12 --- /dev/null +++ b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/CleanerServiceImplTest.java @@ -0,0 +1,67 @@ +package eu.dnetlib.data.utility.cleaner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import javax.xml.ws.wsaddressing.W3CEndpointReference; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.runners.MockitoJUnit44Runner; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; +import eu.dnetlib.enabling.resultset.MappedResultSetFactory; +import eu.dnetlib.test.utils.EPRTestUtil; + +@RunWith(MockitoJUnit44Runner.class) +public class CleanerServiceImplTest { + + /** + * Class under test. 
+ */ + private CleanerServiceImpl service; + + @Mock + private CleaningRuleFactory cleaningRuleFactory; + @Mock + private MappedResultSetFactory mappedResultSetFactory; + @Mock + private CleaningRule cleaningRule; + + private W3CEndpointReference epr_IN = EPRTestUtil.getTestEpr("http://1"); + private W3CEndpointReference epr_OUT = EPRTestUtil.getTestEpr("http://2"); + + private static final String RULE_ID = "RULE_01"; + + @Before + public void setUp() throws Exception { + when(cleaningRuleFactory.obtainCleaningRule(RULE_ID)).thenReturn(cleaningRule); + when(mappedResultSetFactory.createMappedResultSet(epr_IN, cleaningRule)).thenReturn(epr_OUT); + + service = new CleanerServiceImpl(); + service.setCleaningRuleFactory(cleaningRuleFactory); + service.setMappedResultSetFactory(mappedResultSetFactory); + } + + @Test + public void testClean() throws CleanerException { + W3CEndpointReference epr = service.clean(epr_IN, RULE_ID); + assertNotNull(epr); + assertEquals(epr_OUT, epr); + verify(cleaningRuleFactory).obtainCleaningRule(RULE_ID); + } + + @Test(expected = CleanerException.class) + public void testClean_null_1() throws CleanerException { + service.clean(epr_IN, null); + } + + @Test(expected = CleanerException.class) + public void testClean_null_2() throws CleanerException { + service.clean(null, RULE_ID); + } +} diff --git a/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java new file mode 100644 index 0000000..b05e48c --- /dev/null +++ b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java @@ -0,0 +1,135 @@ +package eu.dnetlib.data.utility.cleaner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.StringReader; +import java.util.List; + +import org.dom4j.Document; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; +import org.junit.Test; 
+import org.junit.runner.RunWith; +import org.mockito.runners.MockitoJUnit44Runner; + +@RunWith(MockitoJUnit44Runner.class) +public class GroovyRuleTest { + + @Test + public void testApplyXpathRule_simple_constant() throws Exception { + final GroovyRule rule = new GroovyRule("'YYYY'"); + + final String xpath = "/a/b"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "XXXX"); + + assertEquals("YYYY", doc.valueOf(xpath)); + } + + @Test + public void testApplyXpathRule_simple_regex() throws Exception { + final GroovyRule rule = new GroovyRule("(input =~ /X/).replaceAll('Y')"); + + final String xpath = "/a/b"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "aXaXa"); + + assertEquals("aYaYa", doc.valueOf(xpath)); + } + + @Test + public void testApplyXpathRule_simple_upper() throws Exception { + final GroovyRule rule = new GroovyRule("input.toUpperCase()"); + + final String xpath = "/a/b"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "xyz"); + + assertEquals("XYZ", doc.valueOf(xpath)); + } + + @Test + public void testApplyXpathRule_multi() throws Exception { + final GroovyRule rule = new GroovyRule("'Y'"); + + final String xpath = "/a/b"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "XXX"); + + List list = doc.selectNodes(xpath); + + assertEquals(3, list.size()); + for (Object o : list) { + assertEquals("Y", ((Node) o).getText()); + } + + } + + @Test + public void testApplyXpathRule_singleAttr() throws Exception { + final GroovyRule rule = new GroovyRule("'BBBB'"); + + final String xpath = "/a/b/@value"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "XXXX"); + + assertEquals("BBBB", doc.valueOf(xpath)); + assertEquals("XXXX", doc.valueOf("/a/b")); + } + + @Test + public void testApplyXpathRule_multiAttr() throws Exception { + final GroovyRule rule = new GroovyRule("'B'"); + + final String xpath = "/a/b/@value"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, ""); 
+ + final List list = doc.selectNodes(xpath); + + assertEquals(3, list.size()); + for (Object o : list) { + assertEquals("B", ((Node) o).getText()); + } + } + + @Test + public void testApplyXpathRule_complex() throws Exception { + final GroovyRule rule = new GroovyRule("'B'"); + + final String xpath = "/a/b"; + + rule.setXpath(xpath); + + final Document doc = execute(rule, "XC"); + + assertTrue(doc.valueOf(xpath).contains("B")); + assertEquals("C", doc.valueOf("/a/b/c")); + } + + private Document execute(final GroovyRule rule, final String xml) throws Exception { + + final SAXReader reader = new SAXReader(); + final Document doc = reader.read(new StringReader(xml)); + + System.out.println("BEFORE:\n" + doc.asXML() + "\n"); + rule.applyXpathRule(doc); + System.out.println("AFTER:\n" + doc.asXML() + "\n"); + + System.out.println("-----------------------------\n"); + return doc; + } +} diff --git a/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java new file mode 100644 index 0000000..31185aa --- /dev/null +++ b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java @@ -0,0 +1,128 @@ +package eu.dnetlib.data.utility.cleaner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mockito.Matchers.anyString; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.StringReader; +import java.util.List; + +import org.dom4j.Document; +import org.dom4j.io.SAXReader; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.runners.MockitoJUnit44Runner; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import 
eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +/** Unit tests for VocabularyRule, run with the Mockito JUnit runner against a mocked ISLookUpService. NOTE(review): the XML markup inside the literals passed to execute looks stripped in this copy of the patch, which is why several tests read identically here; confirm against the repository. */ @RunWith(MockitoJUnit44Runner.class) +public class VocabularyRuleTest { + + private static final String VOCABULARY_NAME_1 = "TEST VOCABULARY 1"; + private static final String VOCABULARY_NAME_2 = "TEST VOCABULARY 2"; + /** Entries returned by the mocked lookup. Presumably each one maps the text before the |-:-| separator to the term after it, since the tests expect XXXX to be rewritten to AAAA; verify against VocabularyRule. */ private static final List VOCABULARY = Lists.newArrayList("XXXX|-:-|AAAA", "YYYY|-:-|AAAA", "ZZZZ|-:-|AAAA"); + + /** + * Class Under Test + */ + private VocabularyRule rule; + + @Mock + private ISLookUpService lookup; + + /** Stubs quickSearchProfile to return VOCABULARY for any query string and builds the rule over the two vocabulary names and the mocked lookup. */ @Before + public void setUp() throws Exception { + when(lookup.quickSearchProfile(anyString())).thenReturn(VOCABULARY); + + rule = new VocabularyRule(Sets.newHashSet(VOCABULARY_NAME_1, VOCABULARY_NAME_2), lookup); + } + + /** Checks that, once the rule has been built and applied, quickSearchProfile has been called exactly twice and the rule exposes as many terms as VOCABULARY has entries. */ @Test + public void testSetup() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + + execute("XXXX"); + + verify(lookup, times(2)).quickSearchProfile(anyString()); + assertEquals(VOCABULARY.size(), rule.getVocabularyTerms().size()); + } + + /** Checks that the value at /a/b reads AAAA after the rule runs, and that verifyValue returns null for AAAA and a non-null result for XXXX. */ @Test + public void testApplyXpathRule() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + final Document doc = execute("XXXX"); + assertEquals("AAAA", doc.valueOf(xpath)); + assertNull(rule.verifyValue("AAAA")); + assertNotNull(rule.verifyValue("XXXX")); + } + + /** Same assertions as testApplyXpathRule. NOTE(review): the input document presumably differed before its markup was stripped; confirm. */ @Test + public void testApplyXpathRule_2() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + final Document doc = execute("XXXX"); + assertEquals("AAAA", doc.valueOf(xpath)); + assertNull(rule.verifyValue("AAAA")); + assertNotNull(rule.verifyValue("XXXX")); + } + + /** Checks only that the value at /a/b reads AAAA after the rule runs. */ @Test + public void testApplyXpathRule_3() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + final Document doc = execute("XXXX"); + assertEquals("AAAA", doc.valueOf(xpath)); + } + + /** Targets the attribute xpath /a/b/@value and checks that it reads AAAA after the rule runs, with the same verifyValue expectations as testApplyXpathRule. */ @Test + public void testApplyXpathRule_attr() throws Exception { + final String xpath = "/a/b/@value"; + rule.setXpath(xpath); + final Document doc = execute(""); + assertEquals("AAAA", doc.valueOf(xpath)); +
assertNull(rule.verifyValue("AAAA")); + assertNotNull(rule.verifyValue("XXXX")); + } + + /** Feeds XXXX surrounded by spaces and checks that /a/b still reads AAAA, and that verifyValue returns a non-null result for the space-padded input. */ @Test + public void testApplyXpathRule_with_spaces() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + final Document doc = execute(" XXXX "); + assertEquals("AAAA", doc.valueOf(xpath)); + assertNull(rule.verifyValue("AAAA")); + assertNotNull(rule.verifyValue(" XXXX ")); + } + + /** Feeds the mixed-case input Xxxx and checks that /a/b still reads AAAA, and that verifyValue returns a non-null result for Xxxx. */ @Test + public void testApplyXpathRule_case() throws Exception { + final String xpath = "/a/b"; + rule.setXpath(xpath); + final Document doc = execute("Xxxx"); + assertEquals("AAAA", doc.valueOf(xpath)); + assertNull(rule.verifyValue("AAAA")); + assertNotNull(rule.verifyValue("Xxxx")); + } + + /** Test helper: parses xml with a dom4j SAXReader, prints the document to standard output, applies the rule under test through applyXpathRule, prints the document again, and returns it. */ private Document execute(final String xml) throws Exception { + final SAXReader reader = new SAXReader(); + final Document doc = reader.read(new StringReader(xml)); + System.out.println("BEFORE:\n" + doc.asXML() + "\n"); + rule.applyXpathRule(doc); + System.out.println("AFTER:\n" + doc.asXML() + "\n"); + System.out.println("-----------------------------\n"); + return doc; + } + +} diff --git a/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java new file mode 100644 index 0000000..32e3297 --- /dev/null +++ b/dnet-data-services/src/test/resources/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java @@ -0,0 +1,72 @@ +package eu.dnetlib.data.utility.cleaner; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.runners.MockitoJUnit44Runner; + +import com.google.common.collect.Lists; + +import eu.dnetlib.data.utility.cleaner.rmi.CleanerException; + +/** Unit tests for CleaningRule, driven by a stubbed XPATHCleaningRule. */ @RunWith(MockitoJUnit44Runner.class) +public class XMLCleaningRuleTest { + + private static final String
INPUT_VALID = "" + "
" + " " + " HELLO" + " " + ""; + + private static final String INPUT_INVALID = "" + "
" + " " + " GOOD BYE" + " " + ""; + + /** + * Class under test. + */ + private CleaningRule xmlRule; + /** Stub xpath rule used by the tests: it rewrites HELLO to CIAO, leaves every other value unchanged, and reports any value other than CIAO as an error. */ private XPATHCleaningRule mockXpathRule = new XPATHCleaningRule() { + + /** Returns null when value equals CIAO; otherwise returns a map holding the rejected value under the key term. */ @Override + protected Map verifyValue(final String value) throws CleanerException { + if (value.equals("CIAO")) { return null; } + + Map err = new HashMap(); + err.put("term", value); + return err; + } + + /** Maps HELLO to CIAO and returns any other value as is. */ @Override + protected String calculateNewValue(final String oldValue) throws CleanerException { + if (oldValue.equals("HELLO")) { return "CIAO"; } + return oldValue; + } + };; + + /** Creates a fresh CleaningRule, sets the stub rule to strict, gives it the xpath selecting all a elements, and installs it as the only xpath rule. */ @Before + public void setUp() throws Exception { + xmlRule = new CleaningRule(); + + mockXpathRule.setStrict(true); + mockXpathRule.setXpath("//a"); + + xmlRule.setXpathRules(Lists.newArrayList(mockXpathRule)); + } + + /** Evaluating INPUT_VALID must give output that contains CIAO and does not contain the word invalid. */ @Test + public void testEvaluate_valid() { + String s = xmlRule.evaluate(INPUT_VALID); + assertTrue(s.contains("CIAO")); + assertFalse(s.contains("invalid")); + } + + /** Evaluating INPUT_INVALID must give output that does not contain CIAO and does contain the word invalid; the output is also printed to standard output. */ @Test + public void testEvaluate_invalid() { + String s = xmlRule.evaluate(INPUT_INVALID); + System.out.println(s); + assertFalse(s.contains("CIAO")); + assertTrue(s.contains("invalid")); + } + +} diff --git a/pom.xml b/pom.xml index 0348614..245b516 100644 --- a/pom.xml +++ b/pom.xml @@ -155,6 +155,13 @@ + + org.codehaus.groovy + groovy + 2.1.6 + + + com.jcraft jsch