new xslt adoption and changes in testing xslt's #101

Merged
claudio.atzori merged 4 commits from :hadoop_aggregator into hadoop_aggregator 2021-03-23 08:40:13 +01:00
7 changed files with 828 additions and 69 deletions

View File

@ -0,0 +1,206 @@
package eu.dnetlib.dhp.transformation.xslt;
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;

import java.io.Serializable;
// import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;

import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;
//import eu.dnetlib.pace.clustering.NGramUtils;
//import eu.dnetlib.pace.util.Capitalise;
//import eu.dnetlib.pace.util.DotAbbreviations;
/**
 * Saxon extension function that splits a free-text person name into first-name and
 * surname terms and renders it as {@code "Surname, Firstname"}.
 *
 * <p>The parsed terms of the most recent {@link #normalize(String)} call are kept in
 * instance fields and exposed through the getters. The fields are mutated without any
 * synchronization, so one instance must not be used by several threads at once.
 */
public class PersonCleaner implements ExtensionFunction, Serializable {

    private static final long serialVersionUID = 1L;

    // Terms of the name parsed by the latest normalize() call.
    private List<String> firstname = Lists.newArrayList();
    private List<String> surname = Lists.newArrayList();
    private List<String> fullname = Lists.newArrayList();

    public PersonCleaner() {
    }

    /**
     * Parses {@code s} and returns its normalised form.
     *
     * <p>Bracketed text, digits and all punctuation except {@code -} and {@code ,} are
     * removed before the name is split. For example {@code "Michele G. Artini"} yields
     * {@code "Artini, Michele G."}.
     *
     * @param s the raw name; {@code null} is treated as an empty name
     * @return {@link #getNormalisedFullname()} for the parsed name, {@code ""} for {@code null}
     */
    public String normalize(String s) {
        // The instance is reused for every call, so terms of the previous name must not leak.
        firstname = Lists.newArrayList();
        surname = Lists.newArrayList();
        fullname = Lists.newArrayList();
        if (s == null) {
            return "";
        }

        s = Normalizer.normalize(s, Normalizer.Form.NFD);
        s = s.replaceAll("\\(.+\\)", "");
        s = s.replaceAll("\\[.+\\]", "");
        s = s.replaceAll("\\{.+\\}", "");
        s = s.replaceAll("\\s+-\\s+", "-");
        // Dots are punctuation and are therefore blanked here as well.
        s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
        // replaceAll, not replace: "\\d" has to be interpreted as a regular expression.
        s = s.replaceAll("\\d", " ");
        // Collapses every whitespace run, including line breaks, into one blank.
        s = s.replaceAll("\\s+", " ");

        if (s.contains(",")) {
            parseSurnameFirst(s);
        } else {
            parseFreeForm(s);
        }
        return getNormalisedFullname();
    }

    /** Handles "Surname, Firstname": the text before the first comma is the surname. */
    private void parseSurnameFirst(String s) {
        String[] arr = s.split(",");
        if (arr.length == 1) {
            fullname = splitTerms(arr[0]);
        } else if (arr.length > 1) {
            surname = splitTerms(arr[0]);
            firstname = splitTermsFirstName(arr[1]);
            fullname.addAll(surname);
            fullname.addAll(firstname);
        }
    }

    /** Handles names without a comma by looking for initials and upper-case surnames. */
    private void parseFreeForm(String s) {
        fullname = splitTerms(s);
        if (fullname.isEmpty()) {
            // Nothing left after cleaning; the positional rules below need at least one term.
            return;
        }
        int lastInitialPosition = fullname.size();
        boolean hasSurnameInUpperCase = false;
        for (int i = 0; i < fullname.size(); i++) {
            String term = fullname.get(i);
            if (term.length() == 1) {
                lastInitialPosition = i;
            } else if (term.equals(term.toUpperCase(Locale.ROOT))) {
                hasSurnameInUpperCase = true;
            }
        }
        // Sub-lists are copied: the views returned by subList are not serializable.
        if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
            firstname = Lists.newArrayList(fullname.subList(0, lastInitialPosition + 1));
            surname = Lists.newArrayList(fullname.subList(lastInitialPosition + 1, fullname.size()));
        } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
            for (String term : fullname) {
                if (term.length() > 1 && term.equals(term.toUpperCase(Locale.ROOT))) {
                    surname.add(term);
                } else {
                    firstname.add(term);
                }
            }
        } else if (lastInitialPosition == fullname.size()) { // no initials: last term is the surname
            surname = Lists.newArrayList(fullname.subList(lastInitialPosition - 1, fullname.size()));
            firstname = Lists.newArrayList(fullname.subList(0, lastInitialPosition - 1));
        }
    }

    /**
     * Splits the first-name part on blanks. When the whole part consists of two or three
     * upper-case letters (e.g. "MG") it is split into single initials.
     */
    private List<String> splitTermsFirstName(String s) {
        List<String> list = Lists.newArrayList();
        for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
            // NOTE(review): the test looks at the whole string, not at the current part,
            // so "MG Paolo" is not split into initials - confirm whether that is intended.
            if (s.trim().matches("\\p{Lu}{2,3}")) {
                String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
                for (String p : parts) {
                    if (p.length() > 0) {
                        list.add(p);
                    }
                }
            } else {
                list.add(part);
            }
        }
        return list;
    }

    /** Splits on blanks and drops empty terms. Name-particle filtering is currently disabled. */
    private List<String> splitTerms(String s) {
        List<String> list = Lists.newArrayList();
        for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
            list.add(part);
        }
        return list;
    }

    public List<String> getFirstname() {
        return firstname;
    }

    public List<String> getSurname() {
        return surname;
    }

    public List<String> getFullname() {
        return fullname;
    }

    /** Returns the murmur3 128-bit hash of {@link #getNormalisedFullname()} as a hex string. */
    public String hash() {
        return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
    }

    /**
     * Returns "surname, first names" (single-letter first names get a trailing dot) when
     * both parts were recognised, otherwise all terms joined by blanks.
     */
    public String getNormalisedFullname() {
        return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
            : Joiner.on(" ").join(fullname);
    }

    public List<String> getCapitalSurname() {
        return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
    }

    public List<String> getNameWithAbbreviations() {
        return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
    }

    /** Returns true when both a first name and a surname were recognised. */
    public boolean isAccurate() {
        return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
    }

    /** The local name is "normalize" because the stylesheets call personCleaner:normalize(.). */
    @Override
    public QName getName() {
        return new QName(QNAME_BASE_URI + "/person", "normalize");
    }

    @Override
    public SequenceType getResultType() {
        return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
    }

    /** Declares a single optional string argument: the raw name. */
    @Override
    public SequenceType[] getArgumentTypes() {
        return new SequenceType[] {
            SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
        };
    }

    /**
     * Normalises the string value of the first argument.
     *
     * @return the normalised name, or an empty string when no argument value is supplied
     */
    @Override
    public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
        if (arguments == null || arguments.length == 0 || arguments[0].size() == 0) {
            return new XdmAtomicValue("");
        }
        return new XdmAtomicValue(normalize(arguments[0].itemAt(0).getStringValue()));
    }
}

View File

@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
Processor processor = new Processor(false);
processor.registerExtensionFunction(cleanFunction);
processor.registerExtensionFunction(new DateCleaner());
processor.registerExtensionFunction(new PersonCleaner());
final XsltCompiler comp = processor.newXsltCompiler();
XsltExecutable xslt = comp

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.transformation.xslt.utils;
// import org.apache.commons.text.WordUtils;
// import org.apache.commons.text.WordUtils;
import com.google.common.base.Function;
/**
 * Guava function that lower-cases a string and then upper-cases the first character of
 * every whitespace-separated word.
 */
public class Capitalize implements Function<String, String> {

    /**
     * @param s the text to capitalise; may be {@code null}
     * @return the capitalised text, or {@code null} when {@code s} is {@code null}
     */
    @Override
    public String apply(String s) {
        if (s == null) {
            return null;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale.
        return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase(java.util.Locale.ROOT));
    }
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.transformation.xslt.utils;
import com.google.common.base.Function;
/**
 * Guava function that marks single-character terms as abbreviations by appending a dot.
 */
public class DotAbbreviations implements Function<String, String> {

    /**
     * @param s a name term
     * @return {@code s} followed by "." when it is exactly one character long, otherwise {@code s}
     */
    @Override
    public String apply(String s) {
        if (s.length() != 1) {
            return s;
        }
        return s + ".";
    }
}

View File

@ -103,15 +103,18 @@ public class TransformationJobTest extends AbstractVocabularyTest {
}
@Test
@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite")
@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE")
public void testTransformMostlyUsedScript() throws Exception {
String xslTransformationScript = "";
xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl";
// xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl";
// We Set the input Record getting the XML from the classpath
final MetadataRecord mr = new MetadataRecord();
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
// We Load the XSLT transformation Rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule(
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl");
XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
MetadataRecord result = tr.call(mr);
@ -120,72 +123,6 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// TODO Create significant Assert
}
@Test
@DisplayName("Test TransformSparkJobNode.main with oaiOpenaire_datacite (v4)")
public void transformTestITGv4OAIdatacite(@TempDir Path testDir) throws Exception {
    // Spark session running with the "local" master.
    final SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName(TransformationJobTest.class.getSimpleName());
    sparkConf.setMaster("local");

    try (SparkSession session = SparkSession.builder().config(sparkConf).getOrCreate()) {
        // Input records come from the test classpath, output goes below the temp dir.
        final String inputPath = this
            .getClass()
            .getResource("/eu/dnetlib/dhp/transform/mdstorenative")
            .getFile();
        final String outputPath = testDir.toString() + "/version";

        // Register the stylesheet under the rule id referenced by the job parameters.
        mockupTrasformationRule(
            "simpleTRule",
            "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl");

        final String[][] rawParameters = {
            { "dateOfTransformation", "1234" },
            { "varOfficialName", "Publications at Bielefeld University" },
            { "varOfficialId", "opendoar____::2294" },
            { "transformationPlugin", "XSLT_TRANSFORM" },
            { "transformationRuleId", "simpleTRule" },
        };
        final Map<String, String> parameters = Stream
            .of(rawParameters)
            .collect(Collectors.toMap(pair -> pair[0], pair -> pair[1]));

        TransformSparkJobNode.transformRecords(parameters, isLookUpService, session, inputPath, outputPath);

        // TODO introduce useful assertions
        final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
        final Dataset<MetadataRecord> transformed = session
            .read()
            .format("parquet")
            .load(outputPath + MDSTORE_DATA_PATH)
            .as(encoder);

        final Long total = transformed.count();
        // Records stamped with the requested transformation date.
        final long recordTs = transformed
            .filter((FilterFunction<MetadataRecord>) rec -> rec.getDateOfTransformation() == 1234)
            .count();
        // Records with a non-blank body.
        final long recordNotEmpty = transformed
            .filter((FilterFunction<MetadataRecord>) rec -> !StringUtils.isBlank(rec.getBody()))
            .count();

        assertEquals(total, recordTs);
        assertEquals(total, recordNotEmpty);
    }
}
@Test
@DisplayName("Test TransformSparkJobNode.main")
public void transformTest(@TempDir Path testDir) throws Exception {

View File

@ -0,0 +1,297 @@
<!-- complete literature v4: xslt_cleaning_REST_OmicsDI ; transformation script production , 2021-03-17 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:personCleaner="http://eu/dnetlib/transform/person"
exclude-result-prefixes="xsl vocabulary dateCleaner personCleaner">
<!-- Stylesheet parameters. -->
<!-- Written to the name attribute of oaf:collectedFrom. -->
<xsl:param name="varOfficialName" />
<!-- Not referenced by any template visible in this stylesheet. -->
<xsl:param name="varDsType" />
<!-- Written to the id attribute of oaf:collectedFrom. -->
<xsl:param name="varDataSourceId" />
<!-- Defaults to 0; not referenced by any template visible in this stylesheet. -->
<xsl:param name="index" select="0"/>
<!-- Defaults to the current date and time; written to dr:dateOfTransformation by the header template. -->
<xsl:param name="transDate" select="current-dateTime()"/>
<!--
  Lookup table of the repositories aggregated by OmicsDI, addressed through the key
  kCodeByName by the source attribute.
  id and name are written to oaf:hostedBy; urlTemplate is the prefix to which the record
  accession is appended to build the landing page, so it must end where the accession
  starts and must not contain a sample accession itself.
-->
<xsl:variable name="vCodes">
  <codes>
    <code source="arrayexpress-repository" id="re3data_____::r3d100010222" name="ArrayExpress Archive of Functional Genomics Data" sourceUrl="https://www.ebi.ac.uk/arrayexpress/" urlTemplate="https://www.ebi.ac.uk/arrayexpress/experiments/" />
    <code source="atlas-experiments" id="re3data_____::r3d100010223" name="Expression Atlas Database" sourceUrl="http://www.ebi.ac.uk/gxa/home" urlTemplate="" />
    <code source="biomodels" id="re3data_____::r3d100010789" name="BioModels Database" sourceUrl="https://www.ebi.ac.uk/biomodels-main/" urlTemplate="" />
    <code source="dbgap" id="re3data_____::r3d100010788" name="dbGaP (database of Genotypes and Phenotypes)" sourceUrl="https://www.ncbi.nlm.nih.gov/gap" urlTemplate="" />
    <code source="ega" id="re3data_____::r3d100011242" name="EGA Database (European Genome-phenome Archive)" sourceUrl="https://ega-archive.org" urlTemplate="" />
    <code source="eva" id="re3data_____::r3d100011553" name="EVA database (European Variation Archive)" sourceUrl="https://www.ebi.ac.uk/eva/" urlTemplate="" />
    <code source="geo" id="re3data_____::r3d100010283" name="GEO (Gene Expression Omnibus)" sourceUrl="https://www.ncbi.nlm.nih.gov/geo/" urlTemplate="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" />
    <code source="gnps" id="omicsdi_____::gnps" name="GNPS Database (Global Natural Products Social Molecular Networking)" sourceUrl="https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash2.jsp" urlTemplate="" />
    <code source="gpmdb" id="re3data_____::r3d100010883" name="GPMDB (Global Proteome Machine)" sourceUrl="http://gpmdb.thegpm.org/" urlTemplate="http://gpmdb.thegpm.org/~/dblist_gpmnum/gpmnum=" />
    <code source="jpost" id="re3data_____::r3d100012349" name="JPOST Repository (Japan ProteOme STandard Repository/Database)" sourceUrl="https://jpostdb.org/" urlTemplate="https://repository.jpostdb.org/entry/" />
    <code source="lincs" id="re3data_____::r3d100011833" name="LINCS (Big Data to Knowledge / Library of Integrated Network-based Cellular Signatures)" sourceUrl="http://lincsportal.ccs.miami.edu/dcic-portal/" urlTemplate="http://lincsportal.ccs.miami.edu/datasets/#/view/" />
    <code source="massive" id="omicsdi_____::massive" name="MassIVE Database (Mass Spectrometry Interactive Virtual Environment)" sourceUrl="https://massive.ucsd.edu/ProteoSAFe/datasets.jsp" urlTemplate="" />
    <code source="metabolights_dataset" id="opendoar____::2970" name="MetaboLights Database" sourceUrl="http://www.ebi.ac.uk/metabolights/" urlTemplate="" />
    <code source="metabolome_express" id="omicsdi_____::metabolome" name="MetabolomeExpress" sourceUrl="https://www.metabolome-express.org/" urlTemplate="https://www.metabolome-express.org/datasetview.php?datasetid=" />
    <code source="metabolomics_workbench" id="re3data_____::r3d100012314" name="Metabolomics Workbench Database" sourceUrl="http://www.metabolomicsworkbench.org/" urlTemplate="http://www.metabolomicsworkbench.org/data/DRCCMetadata.php?StudyID=" />
    <code source="NCBI" id="omicsdi_____::ncbi" name="NCBI" sourceUrl="https://www.ncbi.nlm.nih.gov/bioproject/" urlTemplate="https://www.ncbi.nlm.nih.gov/bioproject/" />
    <code source="omics_ena_project" id="re3data_____::r3d100010527" name="ENA (European Nucleotide Archive)" sourceUrl="https://www.ebi.ac.uk/ena" urlTemplate="https://www.ebi.ac.uk/ena/data/view/" />
    <code source="paxdb" id="omicsdi_____::paxdb" name="PAXDB (protein abundance database)" sourceUrl="http://pax-db.org/" urlTemplate="" />
    <code source="peptide_atlas" id="re3data_____::r3d100010889" name="PeptideAtlas Database" sourceUrl="http://www.peptideatlas.org/" urlTemplate="" />
    <code source="pride" id="re3data_____::r3d100010137" name="PRIDE Database (PRoteomics IDEntifications)" sourceUrl="http://www.ebi.ac.uk/pride/archive/" urlTemplate="http://www.ebi.ac.uk/pride/archive/projects/" />
  </codes>
</xsl:variable>
<!--
gnps, jpost, massive, metabolome_express, paxdb: no id/OpenAIRE entry found
ncbi: several OpenAIRE entries found - is one of them the right one?
-->
<!-- Indexes code elements by their source attribute (not by @name, despite the key's name); every key() call in this stylesheet passes $vCodes as the tree to search. -->
<xsl:key name="kCodeByName" match="code" use="string(@source)"/>
<!-- Entry point: every input document is rendered through the validRecord template. -->
<xsl:template match="/">
  <xsl:call-template name="validRecord" />
</xsl:template>
<!-- Aborts the transformation (terminate="yes") with a fixed message; called by the authors template when no usable authors element exists. -->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<!--
  Builds the output record: the processed header, a metadata element holding a
  datacite:resource plus dr and oaf provenance elements, and a copy of every about element.
  Landing page, identifier, title, CobjCategory and dateAccepted are only written for
  records whose datasourceprefix is '_____OmicsDI'.
-->
<xsl:template name="validRecord">
  <!-- vCodes entry of the record's source repository; empty when the source is not listed -->
  <xsl:variable name="varSourceCode"
                select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)" />
  <record>
    <xsl:apply-templates select="//*[local-name() = 'header']" />
    <metadata>
      <!--
      <xsl:apply-templates select="//*[local-name() = 'metadata']//*[local-name() = 'datasets']"/>
      -->
      <datacite:resource>
        <!-- OmicsDI does not state: languages, projects,
        -->
        <!-- landing page -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:alternateIdentifiers>
            <datacite:alternateIdentifier>
              <xsl:attribute name="alternateIdentifierType">
                <xsl:value-of select="'LandingPage'"/>
              </xsl:attribute>
              <xsl:choose>
                <!-- MetabolomeExpress URLs take the accession without its 'MEX' prefix -->
                <xsl:when test="//*[local-name() = 'source'][. = 'metabolome_express']">
                  <xsl:value-of select="concat($varSourceCode/@urlTemplate, substring-after(//*[local-name()='id'], 'MEX'))"/>
                </xsl:when>
                <!-- no URL template known for the source (empty or source not listed):
                     point to the OmicsDI dataset page rather than emitting a bare accession -->
                <xsl:when test="string-length($varSourceCode/@urlTemplate) = 0">
                  <xsl:value-of select="concat('https://www.omicsdi.org/#/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
                </xsl:when>
                <xsl:otherwise>
                  <xsl:value-of select="concat($varSourceCode/@urlTemplate, //*[local-name()='id'])"/>
                </xsl:otherwise>
              </xsl:choose>
            </datacite:alternateIdentifier>
            <datacite:alternateIdentifier>
              <xsl:attribute name="alternateIdentifierType">
                <xsl:value-of select="'local'"/>
              </xsl:attribute>
              <xsl:value-of select="//*[local-name()='id']"/>
            </datacite:alternateIdentifier>
          </datacite:alternateIdentifiers>
        </xsl:if>
        <!-- identifier; ... -->
        <!-- URL hindered by _et_: https://www.omicsdi.org/ws/dataset/get_acc=E-MTAB-6546_et_database=arrayexpress-repository -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:identifier>
            <xsl:attribute name="identifierType">
              <xsl:value-of select="'URL'"/>
            </xsl:attribute>
            <xsl:value-of select="concat('https://www.omicsdi.org/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
          </datacite:identifier>
        </xsl:if>
        <!-- title -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:titles>
            <datacite:title>
              <xsl:value-of select="//*[local-name() = 'title']"/>
            </datacite:title>
          </datacite:titles>
        </xsl:if>
        <!-- no authors in OmicsDI -->
        <!--
        <xsl:call-template name="authors" />
        -->
        <!--
        <xsl:call-template name="relatedPaper" />
        -->
        <datacite:descriptions>
          <datacite:description>
            <xsl:attribute name="descriptionType">
              <xsl:value-of select="'Abstract'"/>
            </xsl:attribute>
            <xsl:value-of select="//*[local-name() = 'description']"/>
          </datacite:description>
        </datacite:descriptions>
        <!-- subject: one entry per distinct omicsType value -->
        <datacite:subjects>
          <xsl:for-each select="distinct-values(//*[local-name()='omicsType'])">
            <datacite:subject>
              <xsl:value-of select="."/>
            </datacite:subject>
          </xsl:for-each>
        </datacite:subjects>
      </datacite:resource>
      <xsl:choose>
        <xsl:when test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <xsl:variable name='varCobjCategory'
                        select="'0021'" />
          <dr:CobjCategory>
            <xsl:attribute name="type">
              <xsl:value-of select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')"/>
            </xsl:attribute>
            <xsl:value-of
                select="$varCobjCategory" />
          </dr:CobjCategory>
        </xsl:when>
        <xsl:otherwise>
          <!--
          <xsl:call-template name="terminate"/>
          -->
        </xsl:otherwise>
      </xsl:choose>
      <!--
      // review status: no review indications found so far
      -->
      <!--
      OMICSDI is including both open and controlled data source.
      -->
      <oaf:accessrights>
        <xsl:text>UNKNOWN</xsl:text>
      </oaf:accessrights>
      <xsl:if test="//*[local-name() = 'datasourceprefix'][.='NeuroVault__']">
        <oaf:concept>
          <xsl:attribute name="id">
            <xsl:value-of select="'ni'"/>
          </xsl:attribute>
        </oaf:concept>
      </xsl:if>
      <!-- hosting repository: name and id of the vCodes entry of the record's source -->
      <oaf:hostedBy>
        <xsl:attribute name="name">
          <xsl:value-of select="$varSourceCode/@name"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varSourceCode/@id"/>
        </xsl:attribute>
      </oaf:hostedBy>
      <oaf:collectedFrom>
        <xsl:attribute name="name">
          <xsl:value-of select="$varOfficialName"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varDataSourceId"/>
        </xsl:attribute>
      </oaf:collectedFrom>
      <!-- date: an eight-digit publicationDate is rewritten with hyphens after the 4th and 6th digit -->
      <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
        <oaf:dateAccepted>
          <xsl:value-of select="replace(//*[local-name() = 'publicationDate'][not(.='null')],'(\d{4})(\d{2})(\d{2})','$1-$2-$3')"/>
        </oaf:dateAccepted>
      </xsl:if>
    </metadata>
    <xsl:copy-of select="//*[local-name() = 'about']" />
  </record>
</xsl:template>
<!-- Identity rule: copies any node or attribute and recurses into its children and attributes. -->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<!-- Copies datasets elements found below a metadata element, recursing into their content.
     NOTE(review): the apply-templates call in validRecord that selects these elements is commented out, so this rule appears not to be reached - confirm. -->
<xsl:template match="//*[local-name() = 'metadata']//*[local-name() = 'datasets']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<!-- Copies the header element with all of its content and appends dr:dateOfTransformation holding $transDate. -->
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<!-- appended after the copied header content -->
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<!--
no authors can be found in OmicsDI
-->
<!--
<xsl:template match="//*[local-name() = 'authors']">
-->
<!--
  Emits one datacite:creator per author. The text of the authors element is split on
  ', and ', ',' and ' and '. Each token is cleaned with personCleaner:normalize; the text
  before the first comma of the cleaned name becomes the family name and the text after it
  the given name.
  Terminates the transformation when there is no authors element with content other than
  white space or the literal 'null'.
-->
<xsl:template name="authors">
  <xsl:choose>
    <xsl:when test="not(//*[local-name() = 'authors'][string-length(normalize-space(.)) > 0 and not(. = 'null')])">
      <xsl:call-template name="terminate" />
    </xsl:when>
    <xsl:otherwise>
      <xsl:for-each select="tokenize(//*[local-name() = 'authors'], '(, and |,| and )')">
        <!-- cleaned once and reused for all three name elements -->
        <xsl:variable name="varCleanedName" select="personCleaner:normalize(.)"/>
        <xsl:element name="datacite:creator">
          <xsl:element name="datacite:creatorName">
            <xsl:value-of select="$varCleanedName"/>
          </xsl:element>
          <xsl:element name="datacite:givenName">
            <xsl:value-of select="normalize-space(substring-after($varCleanedName, ','))"/>
          </xsl:element>
          <xsl:element name="datacite:familyName">
            <xsl:value-of select="substring-before($varCleanedName, ',')"/>
          </xsl:element>
        </xsl:element>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
<xsl:template match="//*[local-name() = 'DOI']">
-->
<!-- Emits a datacite:relatedIdentifier of type DOI with relation isReferencedBy, holding the value of the DOI element(s) of the input.
     The only call to this template, in validRecord, is commented out. -->
<xsl:template name="relatedPaper">
<xsl:element name="datacite:relatedIdentifier">
<xsl:attribute name="relatedIdentifierType">
<xsl:value-of select="'DOI'"/>
</xsl:attribute>
<xsl:attribute name="relationType">
<xsl:value-of select="'isReferencedBy'"/>
</xsl:attribute>
<xsl:value-of select="//*[local-name() = 'DOI']"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,292 @@
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
extension-element-prefixes="TransformationFunction"
exclude-result-prefixes="TransformationFunction">
<!-- Stylesheet parameters. -->
<!-- Written to the name attribute of oaf:collectedFrom. -->
<xsl:param name="varOfficialName" />
<!-- Not referenced by any template visible in this stylesheet. -->
<xsl:param name="varDsType" />
<!-- Written to the id attribute of oaf:collectedFrom. -->
<xsl:param name="varDataSourceId" />
<!-- Defaults to 0; not referenced by any template visible in this stylesheet. -->
<xsl:param name="index" select="0"/>
<!-- Defaults to the current date and time; written to dr:dateOfTransformation by the header template. -->
<xsl:param name="transDate" select="current-dateTime()"/>
<!-- Object returned by the TransformationFunction extension; passed as first argument to every TransformationFunction:convertString call. -->
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
<!--
  Lookup table of the repositories aggregated by OmicsDI, addressed through the key
  kCodeByName by the source attribute.
  id and name are written to oaf:hostedBy; urlTemplate is the prefix to which the record
  accession is appended to build the landing page, so it must end where the accession
  starts and must not contain a sample accession itself.
-->
<xsl:variable name="vCodes">
  <codes>
    <code source="arrayexpress-repository" id="re3data_____::r3d100010222" name="ArrayExpress Archive of Functional Genomics Data" sourceUrl="https://www.ebi.ac.uk/arrayexpress/" urlTemplate="https://www.ebi.ac.uk/arrayexpress/experiments/" />
    <code source="atlas-experiments" id="re3data_____::r3d100010223" name="Expression Atlas Database" sourceUrl="http://www.ebi.ac.uk/gxa/home" urlTemplate="" />
    <code source="biomodels" id="re3data_____::r3d100010789" name="BioModels Database" sourceUrl="https://www.ebi.ac.uk/biomodels-main/" urlTemplate="" />
    <code source="dbgap" id="re3data_____::r3d100010788" name="dbGaP (database of Genotypes and Phenotypes)" sourceUrl="https://www.ncbi.nlm.nih.gov/gap" urlTemplate="" />
    <code source="ega" id="re3data_____::r3d100011242" name="EGA Database (European Genome-phenome Archive)" sourceUrl="https://ega-archive.org" urlTemplate="" />
    <code source="eva" id="re3data_____::r3d100011553" name="EVA database (European Variation Archive)" sourceUrl="https://www.ebi.ac.uk/eva/" urlTemplate="" />
    <code source="geo" id="re3data_____::r3d100010283" name="GEO (Gene Expression Omnibus)" sourceUrl="https://www.ncbi.nlm.nih.gov/geo/" urlTemplate="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" />
    <code source="gnps" id="omicsdi_____::gnps" name="GNPS Database (Global Natural Products Social Molecular Networking)" sourceUrl="https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash2.jsp" urlTemplate="" />
    <code source="gpmdb" id="re3data_____::r3d100010883" name="GPMDB (Global Proteome Machine)" sourceUrl="http://gpmdb.thegpm.org/" urlTemplate="http://gpmdb.thegpm.org/~/dblist_gpmnum/gpmnum=" />
    <code source="jpost" id="re3data_____::r3d100012349" name="JPOST Repository (Japan ProteOme STandard Repository/Database)" sourceUrl="https://jpostdb.org/" urlTemplate="https://repository.jpostdb.org/entry/" />
    <code source="lincs" id="re3data_____::r3d100011833" name="LINCS (Big Data to Knowledge / Library of Integrated Network-based Cellular Signatures)" sourceUrl="http://lincsportal.ccs.miami.edu/dcic-portal/" urlTemplate="http://lincsportal.ccs.miami.edu/datasets/#/view/" />
    <code source="massive" id="omicsdi_____::massive" name="MassIVE Database (Mass Spectrometry Interactive Virtual Environment)" sourceUrl="https://massive.ucsd.edu/ProteoSAFe/datasets.jsp" urlTemplate="" />
    <code source="metabolights_dataset" id="opendoar____::2970" name="MetaboLights Database" sourceUrl="http://www.ebi.ac.uk/metabolights/" urlTemplate="" />
    <code source="metabolome_express" id="omicsdi_____::metabolome" name="MetabolomeExpress" sourceUrl="https://www.metabolome-express.org/" urlTemplate="https://www.metabolome-express.org/datasetview.php?datasetid=" />
    <code source="metabolomics_workbench" id="re3data_____::r3d100012314" name="Metabolomics Workbench Database" sourceUrl="http://www.metabolomicsworkbench.org/" urlTemplate="http://www.metabolomicsworkbench.org/data/DRCCMetadata.php?StudyID=" />
    <code source="NCBI" id="omicsdi_____::ncbi" name="NCBI" sourceUrl="https://www.ncbi.nlm.nih.gov/bioproject/" urlTemplate="https://www.ncbi.nlm.nih.gov/bioproject/" />
    <code source="omics_ena_project" id="re3data_____::r3d100010527" name="ENA (European Nucleotide Archive)" sourceUrl="https://www.ebi.ac.uk/ena" urlTemplate="https://www.ebi.ac.uk/ena/data/view/" />
    <code source="paxdb" id="omicsdi_____::paxdb" name="PAXDB (protein abundance database)" sourceUrl="http://pax-db.org/" urlTemplate="" />
    <code source="peptide_atlas" id="re3data_____::r3d100010889" name="PeptideAtlas Database" sourceUrl="http://www.peptideatlas.org/" urlTemplate="" />
    <code source="pride" id="re3data_____::r3d100010137" name="PRIDE Database (PRoteomics IDEntifications)" sourceUrl="http://www.ebi.ac.uk/pride/archive/" urlTemplate="http://www.ebi.ac.uk/pride/archive/projects/" />
  </codes>
</xsl:variable>
<!--
gnps, jpost, massive, metabolome_express, paxdb: no id/OpenAIRE entry found
ncbi: several OpenAIRE entries found - is one of them the right one?
-->
<!-- Indexes code elements by their source attribute (not by @name, despite the key's name); every key() call in this stylesheet passes $vCodes as the tree to search. -->
<xsl:key name="kCodeByName" match="code" use="string(@source)"/>
<!-- Entry point: every input document is rendered through the validRecord template. -->
<xsl:template match="/">
  <xsl:call-template name="validRecord" />
</xsl:template>
<!-- Aborts the transformation (terminate="yes") with a fixed message; called by the authors template when no usable authors element exists. -->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<!--
  Builds the output record: the processed header, a metadata element holding a
  datacite:resource plus dr and oaf provenance elements, and a copy of every about element.
  Landing page, identifier, title, CobjCategory and dateAccepted are only written for
  records whose datasourceprefix is '_____OmicsDI'.
-->
<xsl:template name="validRecord">
  <!-- vCodes entry of the record's source repository; empty when the source is not listed -->
  <xsl:variable name="varSourceCode"
                select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)" />
  <record>
    <xsl:apply-templates select="//*[local-name() = 'header']" />
    <metadata>
      <!--
      <xsl:apply-templates select="//*[local-name() = 'metadata']//*[local-name() = 'datasets']"/>
      -->
      <datacite:resource>
        <!-- OmicsDI does not state: languages, projects,
        -->
        <!-- landing page -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:alternateIdentifiers>
            <datacite:alternateIdentifier>
              <xsl:attribute name="alternateIdentifierType">
                <xsl:value-of select="'LandingPage'"/>
              </xsl:attribute>
              <xsl:choose>
                <!-- MetabolomeExpress URLs take the accession without its 'MEX' prefix -->
                <xsl:when test="//*[local-name() = 'source'][. = 'metabolome_express']">
                  <xsl:value-of select="concat($varSourceCode/@urlTemplate, substring-after(//*[local-name()='id'], 'MEX'))"/>
                </xsl:when>
                <!-- no URL template known for the source (empty or source not listed):
                     point to the OmicsDI dataset page rather than emitting a bare accession -->
                <xsl:when test="string-length($varSourceCode/@urlTemplate) = 0">
                  <xsl:value-of select="concat('https://www.omicsdi.org/#/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
                </xsl:when>
                <xsl:otherwise>
                  <xsl:value-of select="concat($varSourceCode/@urlTemplate, //*[local-name()='id'])"/>
                </xsl:otherwise>
              </xsl:choose>
            </datacite:alternateIdentifier>
            <datacite:alternateIdentifier>
              <xsl:attribute name="alternateIdentifierType">
                <xsl:value-of select="'local'"/>
              </xsl:attribute>
              <xsl:value-of select="//*[local-name()='id']"/>
            </datacite:alternateIdentifier>
          </datacite:alternateIdentifiers>
        </xsl:if>
        <!-- identifier; ... -->
        <!-- URL hindered by _et_: https://www.omicsdi.org/ws/dataset/get_acc=E-MTAB-6546_et_database=arrayexpress-repository -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:identifier>
            <xsl:attribute name="identifierType">
              <xsl:value-of select="'URL'"/>
            </xsl:attribute>
            <xsl:value-of select="concat('https://www.omicsdi.org/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
          </datacite:identifier>
        </xsl:if>
        <!-- title -->
        <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <datacite:titles>
            <datacite:title>
              <xsl:value-of select="//*[local-name() = 'title']"/>
            </datacite:title>
          </datacite:titles>
        </xsl:if>
        <!-- no authors in OmicsDI -->
        <!--
        <xsl:call-template name="authors" />
        -->
        <!--
        <xsl:call-template name="relatedPaper" />
        -->
        <datacite:descriptions>
          <datacite:description>
            <xsl:attribute name="descriptionType">
              <xsl:value-of select="'Abstract'"/>
            </xsl:attribute>
            <xsl:value-of select="//*[local-name() = 'description']"/>
          </datacite:description>
        </datacite:descriptions>
        <!-- subject: one entry per distinct omicsType value -->
        <datacite:subjects>
          <xsl:for-each select="distinct-values(//*[local-name()='omicsType'])">
            <datacite:subject>
              <xsl:value-of select="."/>
            </datacite:subject>
          </xsl:for-each>
        </datacite:subjects>
      </datacite:resource>
      <xsl:choose>
        <xsl:when test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
          <xsl:variable name='varCobjCategory'
                        select="'0021'" />
          <dr:CobjCategory>
            <xsl:attribute name="type">
              <xsl:value-of select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')"/>
            </xsl:attribute>
            <xsl:value-of
                select="$varCobjCategory" />
          </dr:CobjCategory>
        </xsl:when>
        <xsl:otherwise>
          <!--
          <xsl:call-template name="terminate"/>
          -->
        </xsl:otherwise>
      </xsl:choose>
      <!--
      // review status: no review indications found so far
      -->
      <!--
      OMICSDI is including both open and controlled data source.
      -->
      <oaf:accessrights>
        <xsl:text>UNKNOWN</xsl:text>
      </oaf:accessrights>
      <xsl:if test="//*[local-name() = 'datasourceprefix'][.='NeuroVault__']">
        <oaf:concept>
          <xsl:attribute name="id">
            <xsl:value-of select="'ni'"/>
          </xsl:attribute>
        </oaf:concept>
      </xsl:if>
      <!-- hosting repository: name and id of the vCodes entry of the record's source -->
      <oaf:hostedBy>
        <xsl:attribute name="name">
          <xsl:value-of select="$varSourceCode/@name"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varSourceCode/@id"/>
        </xsl:attribute>
      </oaf:hostedBy>
      <oaf:collectedFrom>
        <xsl:attribute name="name">
          <xsl:value-of select="$varOfficialName"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varDataSourceId"/>
        </xsl:attribute>
      </oaf:collectedFrom>
      <!-- date: an eight-digit publicationDate is rewritten with hyphens after the 4th and 6th digit -->
      <xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
        <oaf:dateAccepted>
          <xsl:value-of select="replace(//*[local-name() = 'publicationDate'][not(.='null')],'(\d{4})(\d{2})(\d{2})','$1-$2-$3')"/>
        </oaf:dateAccepted>
      </xsl:if>
    </metadata>
    <xsl:copy-of select="//*[local-name() = 'about']" />
  </record>
</xsl:template>
<!-- Identity rule: copies any node or attribute and recurses into its children and attributes. -->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<!-- Copies datasets elements found below a metadata element, recursing into their content.
     NOTE(review): the apply-templates call in validRecord that selects these elements is commented out, so this rule appears not to be reached - confirm. -->
<xsl:template match="//*[local-name() = 'metadata']//*[local-name() = 'datasets']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<!-- Copies the header element with all of its content and appends dr:dateOfTransformation holding $transDate. -->
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<!-- appended after the copied header content -->
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<!--
no authors can be found in OmicsDI
-->
<!--
<xsl:template match="//*[local-name() = 'authors']">
-->
<!--
Emits one datacite:creator per author. The text of the authors element is split on
', and ', ',' and ' and '. Each token is converted with
TransformationFunction:convertString(..., 'Person'); the text before the first comma of
the converted name becomes the family name and the text after it the given name.
The only call to this template, in validRecord, is commented out.
-->
<xsl:template name="authors">
<xsl:choose>
<!-- no authors element with content other than white space or the literal 'null': abort -->
<xsl:when test="not(//*[local-name() = 'authors'][string-length(normalize-space(.)) > 0 and not(. = 'null')])">
<xsl:call-template name="terminate" />
</xsl:when>
<xsl:otherwise>
<xsl:for-each select="tokenize(//*[local-name() = 'authors'], '(, and |,| and )')">
<xsl:element name="datacite:creator">
<xsl:element name="datacite:creatorName">
<xsl:value-of select="TransformationFunction:convertString($tf, ., 'Person')"/>
</xsl:element>
<xsl:element name="datacite:givenName">
<xsl:value-of select="normalize-space(substring-after(TransformationFunction:convertString($tf, ., 'Person'), ','))"/>
</xsl:element>
<xsl:element name="datacite:familyName">
<xsl:value-of select="substring-before(TransformationFunction:convertString($tf, ., 'Person'), ',')"/>
</xsl:element>
</xsl:element>
</xsl:for-each>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!--
<xsl:template match="//*[local-name() = 'DOI']">
-->
<!-- Emits a datacite:relatedIdentifier of type DOI with relation isReferencedBy, holding the value of the DOI element(s) of the input.
     The only call to this template, in validRecord, is commented out. -->
<xsl:template name="relatedPaper">
<xsl:element name="datacite:relatedIdentifier">
<xsl:attribute name="relatedIdentifierType">
<xsl:value-of select="'DOI'"/>
</xsl:attribute>
<xsl:attribute name="relationType">
<xsl:value-of select="'isReferencedBy'"/>
</xsl:attribute>
<xsl:value-of select="//*[local-name() = 'DOI']"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>