forked from antonis.lempesis/dnet-hadoop
fixed PersonCleaner extension functions
This commit is contained in:
parent
ef4bfd82e2
commit
fa42026590
|
@ -156,7 +156,7 @@ public class TransformSparkJobNode {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static int getRepartitionNumber(long totalInput, Integer rpt) {
|
private static int getRepartitionNumber(long totalInput, Integer rpt) {
|
||||||
return (int) (totalInput / rpt);
|
return Math.max(1, (int) (totalInput / rpt));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.dhp.transformation.xslt;
|
||||||
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
// import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -18,22 +17,10 @@ import com.google.common.hash.Hashing;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
|
import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
|
||||||
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
|
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
|
||||||
import net.sf.saxon.s9api.ExtensionFunction;
|
import net.sf.saxon.s9api.*;
|
||||||
import net.sf.saxon.s9api.ItemType;
|
|
||||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
|
||||||
import net.sf.saxon.s9api.QName;
|
|
||||||
import net.sf.saxon.s9api.SaxonApiException;
|
|
||||||
import net.sf.saxon.s9api.SequenceType;
|
|
||||||
import net.sf.saxon.s9api.XdmValue;
|
|
||||||
|
|
||||||
//import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
//import eu.dnetlib.pace.util.Capitalise;
|
|
||||||
//import eu.dnetlib.pace.util.DotAbbreviations;
|
|
||||||
|
|
||||||
public class PersonCleaner implements ExtensionFunction, Serializable {
|
public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private static final long serialVersionUID = 1L;
|
private static final long serialVersionUID = 1L;
|
||||||
private List<String> firstname = Lists.newArrayList();
|
private List<String> firstname = Lists.newArrayList();
|
||||||
private List<String> surname = Lists.newArrayList();
|
private List<String> surname = Lists.newArrayList();
|
||||||
|
@ -45,7 +32,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalize(String s) {
|
private String normalize(String s) {
|
||||||
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
||||||
s = s.replaceAll("\\(.+\\)", "");
|
s = s.replaceAll("\\(.+\\)", "");
|
||||||
s = s.replaceAll("\\[.+\\]", "");
|
s = s.replaceAll("\\[.+\\]", "");
|
||||||
|
@ -184,7 +171,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QName getName() {
|
public QName getName() {
|
||||||
return new QName(QNAME_BASE_URI + "/person", "person");
|
return new QName(QNAME_BASE_URI + "/person", "normalize");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -194,13 +181,18 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SequenceType[] getArgumentTypes() {
|
public SequenceType[] getArgumentTypes() {
|
||||||
// TODO Auto-generated method stub
|
return new SequenceType[] {
|
||||||
return null;
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
|
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
||||||
// TODO Auto-generated method stub
|
XdmValue r = xdmValues[0];
|
||||||
return null;
|
if (r.size() == 0) {
|
||||||
|
return new XdmAtomicValue("");
|
||||||
|
}
|
||||||
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||||
|
return new XdmAtomicValue(normalize(currentValue));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,12 +68,6 @@
|
||||||
<xsl:call-template name="validRecord" />
|
<xsl:call-template name="validRecord" />
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
<xsl:template name="terminate">
|
|
||||||
<xsl:message terminate="yes">
|
|
||||||
record is not compliant, transformation is interrupted.
|
|
||||||
</xsl:message>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template name="validRecord">
|
<xsl:template name="validRecord">
|
||||||
<record>
|
<record>
|
||||||
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
||||||
|
@ -282,9 +276,6 @@
|
||||||
<xsl:value-of select="$varEmbargoEndDate"/>
|
<xsl:value-of select="$varEmbargoEndDate"/>
|
||||||
</oaf:embargoenddate>
|
</oaf:embargoenddate>
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
<xsl:otherwise>
|
|
||||||
<xsl:call-template name="terminate"/>
|
|
||||||
</xsl:otherwise>
|
|
||||||
</xsl:choose>
|
</xsl:choose>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
|
|
||||||
|
@ -310,9 +301,6 @@
|
||||||
</dr:CobjCategory>
|
</dr:CobjCategory>
|
||||||
-->
|
-->
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
<xsl:otherwise>
|
|
||||||
<xsl:call-template name="terminate"/>
|
|
||||||
</xsl:otherwise>
|
|
||||||
</xsl:choose>
|
</xsl:choose>
|
||||||
|
|
||||||
<!-- review status -->
|
<!-- review status -->
|
||||||
|
|
Loading…
Reference in New Issue