1
0
Fork 0

fixed PersonCleaner extension functions

This commit is contained in:
Claudio Atzori 2021-04-27 10:10:06 +02:00
parent ef4bfd82e2
commit fa42026590
3 changed files with 15 additions and 35 deletions

View File

@ -156,7 +156,7 @@ public class TransformSparkJobNode {
* @return
*/
private static int getRepartitionNumber(long totalInput, Integer rpt) {
return (int) (totalInput / rpt);
return Math.max(1, (int) (totalInput / rpt));
}
}

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.transformation.xslt;
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
import java.io.Serializable;
// import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
@ -18,22 +17,10 @@ import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmValue;
//import eu.dnetlib.pace.clustering.NGramUtils;
//import eu.dnetlib.pace.util.Capitalise;
//import eu.dnetlib.pace.util.DotAbbreviations;
import net.sf.saxon.s9api.*;
public class PersonCleaner implements ExtensionFunction, Serializable {
/**
*
*/
private static final long serialVersionUID = 1L;
private List<String> firstname = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
@ -45,7 +32,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
}
public String normalize(String s) {
private String normalize(String s) {
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
s = s.replaceAll("\\(.+\\)", "");
s = s.replaceAll("\\[.+\\]", "");
@ -184,7 +171,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
@Override
public QName getName() {
return new QName(QNAME_BASE_URI + "/person", "person");
return new QName(QNAME_BASE_URI + "/person", "normalize");
}
@Override
@ -194,13 +181,18 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
@Override
public SequenceType[] getArgumentTypes() {
// TODO Auto-generated method stub
return null;
return new SequenceType[] {
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
};
}
@Override
public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
// TODO Auto-generated method stub
return null;
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
XdmValue r = xdmValues[0];
if (r.size() == 0) {
return new XdmAtomicValue("");
}
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
return new XdmAtomicValue(normalize(currentValue));
}
}

View File

@ -68,12 +68,6 @@
<xsl:call-template name="validRecord" />
</xsl:template>
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template name="validRecord">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
@ -282,9 +276,6 @@
<xsl:value-of select="$varEmbargoEndDate"/>
</oaf:embargoenddate>
</xsl:when>
<xsl:otherwise>
<xsl:call-template name="terminate"/>
</xsl:otherwise>
</xsl:choose>
</xsl:if>
@ -310,9 +301,6 @@
</dr:CobjCategory>
-->
</xsl:when>
<xsl:otherwise>
<xsl:call-template name="terminate"/>
</xsl:otherwise>
</xsl:choose>
<!-- review status -->