dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/utility/cleaner/XPATHCleaningRule.java

78 lines
2.0 KiB
Java

package eu.dnetlib.data.utility.cleaner;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import org.springframework.beans.factory.annotation.Required;
import com.google.common.collect.Lists;
import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
/**
 * Base class for cleaning rules that rewrite the text of every node selected by an XPath expression.
 *
 * <p>Subclasses supply the actual transformation ({@link #calculateNewValue(String)}) and, for strict
 * rules, the validation of the transformed value ({@link #verifyValue(String)}). Changes and invalid
 * values are reported at INFO level on the logger named {@code VOCABULARY_RULES}.
 */
public abstract class XPATHCleaningRule {

    /** XPath expression selecting the nodes this rule rewrites. */
    private String xpath;

    /** When true, every rewritten value is additionally passed to {@link #verifyValue(String)}. */
    private boolean strict = false;

    private static final Log logCleaningRules = LogFactory.getLog("VOCABULARY_RULES");

    /**
     * Applies this rule to all nodes of {@code doc} matched by the configured XPath.
     *
     * <p>For each matched node the trimmed text is passed to {@link #calculateNewValue(String)} and the
     * result is written back to the node. In strict mode the new value is also passed to
     * {@link #verifyValue(String)}; every non-null result is collected as an error.
     *
     * @param doc the document to clean; it is modified in place
     * @return the errors reported by {@link #verifyValue(String)}; empty when the rule is not strict or
     *         no value was rejected
     * @throws CleanerException if thrown by {@link #calculateNewValue(String)} or {@link #verifyValue(String)}
     */
    public List<Map<String, String>> applyXpathRule(final Document doc) throws CleanerException {
        final List<Map<String, String>> errors = Lists.newArrayList();
        // Record identifier, used only to give context in the log messages below.
        final String id = doc.valueOf("//*[local-name()='objIdentifier']");

        for (Object o : doc.selectNodes(xpath)) {
            final Node node = (Node) o;
            final String oldValue = node.getText().trim();
            final String newValue = calculateNewValue(oldValue);

            if (strict) {
                final Map<String, String> err = verifyValue(newValue);
                if (err != null) {
                    errors.add(err);
                    if (logCleaningRules.isInfoEnabled()) {
                        logCleaningRules.info("[" + newValue + "] is INVALID, " + "RULE: " + toString() + ", " + "RECORD: " + id + ", "
                                + "XPATH: " + this.getXpath());
                    }
                }
            }

            // Compare from the oldValue side: oldValue is the result of trim() and therefore non-null,
            // whereas calculateNewValue is abstract and nothing here guarantees a non-null result.
            // This keeps the control flow identical whether or not INFO logging is enabled.
            if (logCleaningRules.isInfoEnabled() && !oldValue.equals(newValue)) {
                logCleaningRules.info("[" + oldValue + "] => [" + newValue + "], " + toString() + ", " + "RECORD: " + id + ", " + "XPATH: "
                        + this.getXpath());
            }

            node.setText(newValue);
        }
        return errors;
    }

    /**
     * Validates a value produced by {@link #calculateNewValue(String)}. Only called when the rule is strict.
     *
     * @param value the new value to check
     * @return a description of the problem, or {@code null} when the value is acceptable
     * @throws CleanerException if the validation cannot be performed
     */
    protected abstract Map<String, String> verifyValue(final String value) throws CleanerException;

    /**
     * Computes the replacement for the text of a matched node.
     *
     * @param oldValue the current, trimmed text of the node
     * @return the text to store in the node
     * @throws CleanerException if the new value cannot be computed
     */
    protected abstract String calculateNewValue(final String oldValue) throws CleanerException;

    /**
     * @return the XPath expression selecting the nodes this rule rewrites
     */
    public String getXpath() {
        return xpath;
    }

    /**
     * Sets the XPath expression selecting the nodes this rule rewrites. Marked as a required bean property.
     *
     * @param xpath the XPath expression
     */
    @Required
    public void setXpath(final String xpath) {
        this.xpath = xpath;
    }

    /**
     * @return whether rewritten values are also validated with {@link #verifyValue(String)}
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * @param strict {@code true} to validate every rewritten value with {@link #verifyValue(String)}
     */
    public void setStrict(final boolean strict) {
        this.strict = strict;
    }
}