forked from D-Net/dnet-hadoop
Merge remote-tracking branch 'origin/stable_ids' into stable_ids
This commit is contained in:
commit
44a0064df6
|
@ -42,6 +42,8 @@ public class Constants {
|
|||
public static final String RETRY_DELAY = "retryDelay";
|
||||
public static final String CONNECT_TIMEOUT = "connectTimeOut";
|
||||
public static final String READ_TIMEOUT = "readTimeOut";
|
||||
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
|
||||
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
|
||||
|
||||
public static final String CONTENT_TOTALITEMS = "TotalItems";
|
||||
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
|
||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf;
|
|||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
|
@ -22,6 +23,9 @@ public class CleaningFunctions {
|
|||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||
/**
 * Pattern removed from a lower-cased title before its residual length is compared with
 * TITLE_FILTER_RESIDUAL_LENGTH: the token "test", any non-word character and any digit.
 * <p>
 * This is an alternation on purpose. The previous value was wrapped in square brackets, which
 * made it a character class that stripped the single letters 't', 'e' and 's' (plus '.' and '*')
 * from every title, so regular titles could fall below the residual-length threshold.
 */
public static final String TITLE_FILTER_REGEX = "test|\\W|\\d";
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
||||
|
||||
static {
|
||||
PID_BLACKLIST.add("none");
|
||||
|
@ -80,6 +84,36 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> boolean filter(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Organization) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
|
||||
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T cleanup(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
|
@ -124,6 +158,12 @@ public class CleaningFunctions {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(
|
||||
sp -> sp
|
||||
.getValue()
|
||||
.toLowerCase()
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
||||
.map(CleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -199,16 +239,7 @@ public class CleaningFunctions {
|
|||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
boolean nullRank = r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.anyMatch(a -> Objects.isNull(a.getRank()));
|
||||
if (nullRank) {
|
||||
int i = 1;
|
||||
for (Author author : r.getAuthor()) {
|
||||
author.setRank(i++);
|
||||
}
|
||||
}
|
||||
final List<Author> authors = Lists.newArrayList();
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
|
@ -235,7 +266,26 @@ public class CleaningFunctions {
|
|||
.stream()
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (StringUtils.isBlank(a.getFullname())) {
|
||||
if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) {
|
||||
a.setFullname(a.getSurname() + ", " + a.getName());
|
||||
}
|
||||
}
|
||||
if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) {
|
||||
authors.add(a);
|
||||
}
|
||||
}
|
||||
|
||||
boolean nullRank = authors
|
||||
.stream()
|
||||
.anyMatch(a -> Objects.isNull(a.getRank()));
|
||||
if (nullRank) {
|
||||
int i = 1;
|
||||
for (Author author : authors) {
|
||||
author.setRank(i++);
|
||||
}
|
||||
}
|
||||
r.setAuthor(authors);
|
||||
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
@ -252,6 +302,15 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
private static boolean isValidAuthorName(Author a) {
|
||||
return !Stream
|
||||
.of(a.getFullname(), a.getName(), a.getSurname())
|
||||
.filter(s -> s != null && !s.isEmpty())
|
||||
.collect(Collectors.joining(""))
|
||||
.toLowerCase()
|
||||
.matches(INVALID_AUTHOR_REGEX);
|
||||
}
|
||||
|
||||
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
||||
return pids
|
||||
.stream()
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
|
||||
package eu.dnetlib.dhp.transformation.xslt;
|
||||
|
||||
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
||||
|
||||
import java.io.Serializable;
|
||||
// import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
|
||||
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.ItemType;
|
||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
import net.sf.saxon.s9api.SequenceType;
|
||||
import net.sf.saxon.s9api.XdmValue;
|
||||
|
||||
//import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
//import eu.dnetlib.pace.util.Capitalise;
|
||||
//import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
private List<String> firstname = Lists.newArrayList();
|
||||
private List<String> surname = Lists.newArrayList();
|
||||
private List<String> fullname = Lists.newArrayList();
|
||||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
public PersonCleaner() {
|
||||
|
||||
}
|
||||
|
||||
public String normalize(String s) {
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
|
||||
// s = s.replaceAll("[\\W&&[^,-]]", " ");
|
||||
|
||||
// System.out.println("class Person: s: " + s);
|
||||
|
||||
// s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " ");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
|
||||
s = s.replace("\\d", " ");
|
||||
s = s.replace("\\n", " ");
|
||||
s = s.replace("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (s.contains(",")) {
|
||||
// System.out.println("class Person: s: " + s);
|
||||
|
||||
String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
|
||||
fullname = splitTerms(arr[0]);
|
||||
} else if (arr.length > 1) {
|
||||
surname = splitTerms(arr[0]);
|
||||
firstname = splitTermsFirstName(arr[1]);
|
||||
// System.out.println("class Person: surname: " + surname);
|
||||
// System.out.println("class Person: firstname: " + firstname);
|
||||
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(firstname);
|
||||
}
|
||||
} else {
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
firstname = fullname.subList(0, lastInitialPosition + 1);
|
||||
System.out.println("name: " + firstname);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (String term : fullname) {
|
||||
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
firstname.add(term);
|
||||
}
|
||||
}
|
||||
} else if (lastInitialPosition == fullname.size()) {
|
||||
surname = fullname.subList(lastInitialPosition - 1, fullname.size());
|
||||
firstname = fullname.subList(0, lastInitialPosition - 1);
|
||||
}
|
||||
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<String> splitTermsFirstName(String s) {
|
||||
List<String> list = Lists.newArrayList();
|
||||
for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (s.trim().matches("\\p{Lu}{2,3}")) {
|
||||
String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
|
||||
for (String p : parts) {
|
||||
if (p.length() > 0)
|
||||
list.add(p);
|
||||
}
|
||||
} else {
|
||||
list.add(part);
|
||||
}
|
||||
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private List<String> splitTerms(String s) {
|
||||
if (particles == null) {
|
||||
// particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
List<String> list = Lists.newArrayList();
|
||||
for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
// if (!particles.contains(part.toLowerCase())) {
|
||||
list.add(part);
|
||||
|
||||
// }
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<String> getFirstname() {
|
||||
return firstname;
|
||||
}
|
||||
|
||||
public List<String> getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public List<String> getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public String hash() {
|
||||
return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
public String getNormalisedFullname() {
|
||||
return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
|
||||
: Joiner.on(" ").join(fullname);
|
||||
// return isAccurate() ?
|
||||
// Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) :
|
||||
// Joiner.on(" ").join(fullname);
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName(QNAME_BASE_URI + "/person", "person");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
|
|||
Processor processor = new Processor(false);
|
||||
processor.registerExtensionFunction(cleanFunction);
|
||||
processor.registerExtensionFunction(new DateCleaner());
|
||||
processor.registerExtensionFunction(new PersonCleaner());
|
||||
|
||||
final XsltCompiler comp = processor.newXsltCompiler();
|
||||
XsltExecutable xslt = comp
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.transformation.xslt.utils;
|
||||
|
||||
// import org.apache.commons.text.WordUtils;
|
||||
// import org.apache.commons.text.WordUtils;
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class Capitalize implements Function<String, String> {
|
||||
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.transformation.xslt.utils;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class DotAbbreviations implements Function<String, String> {
|
||||
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
}
|
|
@ -92,15 +92,19 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid")
|
||||
public void testTransformITGv4() throws Exception {
|
||||
@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE")
|
||||
public void testTransformMostlyUsedScript() throws Exception {
|
||||
|
||||
String xslTransformationScript = "";
|
||||
xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl";
|
||||
xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl";
|
||||
|
||||
|
||||
// We Set the input Record getting the XML from the classpath
|
||||
final MetadataRecord mr = new MetadataRecord();
|
||||
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
|
||||
// We Load the XSLT transformation Rule from the classpath
|
||||
XSLTTransformationFunction tr = loadTransformationRule(
|
||||
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl");
|
||||
XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
|
||||
|
||||
MetadataRecord result = tr.call(mr);
|
||||
|
||||
|
@ -110,15 +114,18 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite")
|
||||
public void testTransformMostlyUsedScript() throws Exception {
|
||||
@DisplayName("Test Transform record XML with xslt_cleaning_REST_OmicsDI")
|
||||
public void testTransformRestScript() throws Exception {
|
||||
|
||||
String xslTransformationScript = "";
|
||||
xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl";
|
||||
|
||||
|
||||
// We Set the input Record getting the XML from the classpath
|
||||
final MetadataRecord mr = new MetadataRecord();
|
||||
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
|
||||
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml")));
|
||||
// We Load the XSLT transformation Rule from the classpath
|
||||
XSLTTransformationFunction tr = loadTransformationRule(
|
||||
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl");
|
||||
XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
|
||||
|
||||
MetadataRecord result = tr.call(mr);
|
||||
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<oai:record xmlns="http://namespace.openaire.eu/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<oai:header>
|
||||
<dri:objIdentifier>_____OmicsDI::0000337c02d1b51030675d69407655da</dri:objIdentifier>
|
||||
<dri:recordIdentifier>PRJNA78295</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2020-10-31T15:31:30.725Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>_____OmicsDI</oaf:datasourceprefix>
|
||||
</oai:header>
|
||||
<oai:metadata>
|
||||
<datasets xmlns="">
|
||||
<connectionsCountScaled>0.235294117647059</connectionsCountScaled>
|
||||
<reanalysisCount>0</reanalysisCount>
|
||||
<keywords>null</keywords>
|
||||
<citationsCountScaled>0.0</citationsCountScaled>
|
||||
<viewsCount>0</viewsCount>
|
||||
<description>Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly</description>
|
||||
<downloadCountScaled>8.20101314054644E-5</downloadCountScaled>
|
||||
<source>omics_ena_project</source>
|
||||
<title>Sedimentitalea nanhaiensis DSM 24252</title>
|
||||
<connectionsCount>14</connectionsCount>
|
||||
<citationsCount>0</citationsCount>
|
||||
<score>null</score>
|
||||
<omicsType>Genomics</omicsType>
|
||||
<reanalysisCountScaled>0.0</reanalysisCountScaled>
|
||||
<organisms>
|
||||
<acc>571166</acc>
|
||||
<name>Sedimentitalea nanhaiensis DSM 24252</name>
|
||||
</organisms>
|
||||
<viewsCountScaled>0.0</viewsCountScaled>
|
||||
<claimable>false</claimable>
|
||||
<id>PRJNA78295</id>
|
||||
<publicationDate>null</publicationDate>
|
||||
<downloadCount>13</downloadCount>
|
||||
</datasets>
|
||||
</oai:metadata>
|
||||
<about xmlns="">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2020-10-31T15:31:30.725Z">
|
||||
<baseURL>https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||
classname="sysimport:crosswalk:datasetarchive"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</oai:record>
|
|
@ -0,0 +1,297 @@
|
|||
<!-- complete literature v4: xslt_cleaning_REST_OmicsDI ; transformation script production , 2021-03-17 -->
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
version="2.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
|
||||
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
|
||||
xmlns:personCleaner="http://eu/dnetlib/transform/person"
|
||||
exclude-result-prefixes="xsl vocabulary dateCleaner personCleaner">
|
||||
|
||||
<xsl:param name="varOfficialName" />
|
||||
<xsl:param name="varDsType" />
|
||||
<xsl:param name="varDataSourceId" />
|
||||
<xsl:param name="index" select="0"/>
|
||||
<xsl:param name="transDate" select="current-dateTime()"/>
|
||||
|
||||
<xsl:variable name="vCodes">
|
||||
<codes>
|
||||
<code source="arrayexpress-repository" id="re3data_____::r3d100010222" name="ArrayExpress Archive of Functional Genomics Data" sourceUrl="https://www.ebi.ac.uk/arrayexpress/" urlTemplate="https://www.ebi.ac.uk/arrayexpress/experiments/" />
|
||||
<code source="atlas-experiments" id="re3data_____::r3d100010223" name="Expression Atlas Database" sourceUrl="http://www.ebi.ac.uk/gxa/home" urlTemplate="" />
|
||||
<code source="biomodels" id="re3data_____::r3d100010789" name="BioModels Database" sourceUrl="https://www.ebi.ac.uk/biomodels-main/" urlTemplate="" />
|
||||
<code source="dbgap" id="re3data_____::r3d100010788" name="dbGaP (database of Genotypes and Phenotypes)" sourceUrl="https://www.ncbi.nlm.nih.gov/gap" urlTemplate="" />
|
||||
<code source="ega" id="re3data_____::r3d100011242" name="EGA Database (European Genome-phenome Archive)" sourceUrl="https://ega-archive.org" urlTemplate="" />
|
||||
<code source="eva" id="re3data_____::r3d100011553" name="EVA database (European Variation Archive)" sourceUrl="https://www.ebi.ac.uk/eva/" urlTemplate="" />
|
||||
<code source="geo" id="re3data_____::r3d100010283" name="GEO (Gene Expression Omnibus)" sourceUrl="https://www.ncbi.nlm.nih.gov/geo/" urlTemplate="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" />
|
||||
<code source="gnps" id="omicsdi_____::gnps" name="GNPS Database (Global Natural Products Social Molecular Networking)" sourceUrl="https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash2.jsp" urlTemplate="" />
|
||||
<code source="gpmdb" id="re3data_____::r3d100010883" name="GPMDB (Global Proteome Machine)" sourceUrl="http://gpmdb.thegpm.org/" urlTemplate="http://gpmdb.thegpm.org/~/dblist_gpmnum/gpmnum=" />
|
||||
<code source="jpost" id="re3data_____::r3d100012349" name="JPOST Repository (Japan ProteOme STandard Repository/Database)" sourceUrl="https://jpostdb.org/" urlTemplate="https://repository.jpostdb.org/entry/JPST000228" />
|
||||
<code source="lincs" id="re3data_____::r3d100011833" name="LINCS (Big Data to Knowledge / Library of Integrated Network-based Cellular Signatures)" sourceUrl="http://lincsportal.ccs.miami.edu/dcic-portal/" urlTemplate="http://lincsportal.ccs.miami.edu/datasets/#/view/" />
|
||||
<code source="massive" id="omicsdi_____::massive" name="MassIVE Database (Mass Spectrometry Interactive Virtual Environment)" sourceUrl="https://massive.ucsd.edu/ProteoSAFe/datasets.jsp" urlTemplate="" />
|
||||
<code source="metabolights_dataset" id="opendoar____::2970" name="MetaboLights Database" sourceUrl="http://www.ebi.ac.uk/metabolights/" urlTemplate="" />
|
||||
<code source="metabolome_express" id="omicsdi_____::metabolome" name="MetabolomeExpress" sourceUrl="https://www.metabolome-express.org/" urlTemplate="https://www.metabolome-express.org/datasetview.php?datasetid=" />
|
||||
<code source="metabolomics_workbench" id="re3data_____::r3d100012314" name="Metabolomics Workbench Database" sourceUrl="http://www.metabolomicsworkbench.org/" urlTemplate="http://www.metabolomicsworkbench.org/data/DRCCMetadata.php?StudyID=" />
|
||||
<code source="NCBI" id="omicsdi_____::ncbi" name="NCBI" sourceUrl="https://www.ncbi.nlm.nih.gov/bioproject/" urlTemplate="https://www.ncbi.nlm.nih.gov/bioproject/" />
|
||||
<code source="omics_ena_project" id="re3data_____::r3d100010527" name="ENA (European Nucleotide Archive)" sourceUrl="https://www.ebi.ac.uk/ena" urlTemplate="https://www.ebi.ac.uk/ena/data/view/" />
|
||||
<code source="paxdb" id="omicsdi_____::paxdb" name="PAXDB (protein abundance database)" sourceUrl="http://pax-db.org/" urlTemplate="" />
|
||||
<code source="peptide_atlas" id="re3data_____::r3d100010889" name="PeptideAtlas Database" sourceUrl="http://www.peptideatlas.org/" urlTemplate="" />
|
||||
<code source="pride" id="re3data_____::r3d100010137" name="PRIDE Database (PRoteomics IDEntifications)" sourceUrl="http://www.ebi.ac.uk/pride/archive/" urlTemplate="http://www.ebi.ac.uk/pride/archive/projects/PXD008134" />
|
||||
</codes>
|
||||
</xsl:variable>
|
||||
|
||||
<!--
|
||||
gnps, jpost, massive, metabolome_express, paxdb: no id/OpenAIRE-entry found
|
||||
ncbi: several OpenAIRE-entries found - is one the right?
|
||||
-->
|
||||
|
||||
<xsl:key name="kCodeByName" match="code" use="string(@source)"/>
|
||||
|
||||
<!-- Entry point: the document root is rendered through the validRecord template. -->
<xsl:template match="/">
    <xsl:call-template name="validRecord" />
</xsl:template>
|
||||
|
||||
<xsl:template name="terminate">
|
||||
<xsl:message terminate="yes">
|
||||
record is not compliant, transformation is interrupted.
|
||||
</xsl:message>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="validRecord">
|
||||
<record>
|
||||
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
||||
|
||||
<metadata>
|
||||
|
||||
<!--
|
||||
<xsl:apply-templates select="//*[local-name() = 'metadata']//*[local-name() = 'datasets']"/>
|
||||
-->
|
||||
|
||||
<datacite:resource>
|
||||
|
||||
<!-- OmicsDI does not state: languages, projects,
|
||||
-->
|
||||
|
||||
<!-- landing page -->
|
||||
<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
|
||||
<datacite:alternateIdentifiers>
|
||||
<datacite:alternateIdentifier>
|
||||
<xsl:attribute name="alternateIdentifierType">
|
||||
<xsl:value-of select="'LandingPage'"/>
|
||||
</xsl:attribute>
|
||||
<xsl:choose>
|
||||
<xsl:when test="//*[local-name() = 'source'][. = ('gnps','massive','paxdb','peptide_atlas')]">
|
||||
<xsl:value-of select="concat('https://www.omicsdi.org/#/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
|
||||
</xsl:when>
|
||||
<xsl:when test="//*[local-name() = 'source'][. = 'metabolome_express']">
|
||||
<xsl:value-of select="concat(key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@urlTemplate, substring-after(//*[local-name()='id'], 'MEX'))"/>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="concat(key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@urlTemplate, //*[local-name()='id'])"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</datacite:alternateIdentifier>
|
||||
<datacite:alternateIdentifier>
|
||||
<xsl:attribute name="alternateIdentifierType">
|
||||
<xsl:value-of select="'local'"/>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="//*[local-name()='id']"/>
|
||||
</datacite:alternateIdentifier>
|
||||
</datacite:alternateIdentifiers>
|
||||
</xsl:if>
|
||||
|
||||
<!-- identifier; ... -->
|
||||
<!-- URL hindered by _et_: https://www.omicsdi.org/ws/dataset/get_acc=E-MTAB-6546_et_database=arrayexpress-repository -->
|
||||
<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
|
||||
<datacite:identifier>
|
||||
<xsl:attribute name="identifierType">
|
||||
<xsl:value-of select="'URL'"/>
|
||||
</xsl:attribute>
|
||||
|
||||
<xsl:value-of select="concat('https://www.omicsdi.org/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
|
||||
|
||||
</datacite:identifier>
|
||||
</xsl:if>
|
||||
|
||||
<!-- title -->
|
||||
<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
|
||||
<datacite:titles>
|
||||
<datacite:title>
|
||||
<xsl:value-of select="//*[local-name() = 'title']"/>
|
||||
</datacite:title>
|
||||
</datacite:titles>
|
||||
</xsl:if>
|
||||
|
||||
<!-- no authors in OmicsDI -->
|
||||
<!--
|
||||
<xsl:call-template name="authors" />
|
||||
-->
|
||||
|
||||
<!--
|
||||
<xsl:call-template name="relatedPaper" />
|
||||
-->
|
||||
|
||||
<datacite:descriptions>
|
||||
<datacite:description>
|
||||
<xsl:attribute name="descriptionType">
|
||||
<xsl:value-of select="'Abstract'"/>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="//*[local-name() = 'description']"/>
|
||||
</datacite:description>
|
||||
</datacite:descriptions>
|
||||
|
||||
<!-- subject -->
|
||||
<datacite:subjects>
|
||||
<xsl:for-each select="distinct-values(//*[local-name()='omicsType'])">
|
||||
<datacite:subject>
|
||||
<xsl:value-of select="."/>
|
||||
</datacite:subject>
|
||||
</xsl:for-each>
|
||||
</datacite:subjects>
|
||||
|
||||
</datacite:resource>
|
||||
|
||||
<xsl:choose>
|
||||
|
||||
<xsl:when test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
|
||||
<xsl:variable name='varCobjCategory'
|
||||
select="'0021'" />
|
||||
<dr:CobjCategory>
|
||||
<xsl:attribute name="type">
|
||||
<xsl:value-of select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')"/>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of
|
||||
select="$varCobjCategory" />
|
||||
</dr:CobjCategory>
|
||||
</xsl:when>
|
||||
|
||||
<xsl:otherwise>
|
||||
<!--
|
||||
<xsl:call-template name="terminate"/>
|
||||
-->
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
<!--
|
||||
// review status: no review indications found so far
|
||||
-->
|
||||
<!--
|
||||
OMICSDI is ‘including both open and controlled data source’.
|
||||
-->
|
||||
<oaf:accessrights>
|
||||
<xsl:text>UNKNOWN</xsl:text>
|
||||
</oaf:accessrights>
|
||||
|
||||
<xsl:if test="//*[local-name() = 'datasourceprefix'][.='NeuroVault__']">
|
||||
<oaf:concept>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="'ni'"/>
|
||||
</xsl:attribute>
|
||||
</oaf:concept>
|
||||
</xsl:if>
|
||||
|
||||
<oaf:hostedBy>
|
||||
<xsl:attribute name="name">
|
||||
<xsl:value-of select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@name"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@id"/>
|
||||
</xsl:attribute>
|
||||
</oaf:hostedBy>
|
||||
<oaf:collectedFrom>
|
||||
<xsl:attribute name="name">
|
||||
<xsl:value-of select="$varOfficialName"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="$varDataSourceId"/>
|
||||
</xsl:attribute>
|
||||
</oaf:collectedFrom>
|
||||
|
||||
<!-- date -->
|
||||
<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
|
||||
<oaf:dateAccepted>
|
||||
<xsl:value-of select="replace(//*[local-name() = 'publicationDate'][not(.='null')],'(\d{4})(\d{2})(\d{2})','$1-$2-$3')"/>
|
||||
</oaf:dateAccepted>
|
||||
</xsl:if>
|
||||
|
||||
</metadata>
|
||||
<xsl:copy-of select="//*[local-name() = 'about']" />
|
||||
</record>
|
||||
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="node()|@*">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//*[local-name() = 'metadata']//*[local-name() = 'datasets']">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//*[local-name() = 'header']">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
<xsl:element name="dr:dateOfTransformation">
|
||||
<xsl:value-of select="$transDate"/>
|
||||
</xsl:element>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
<!--
|
||||
no authors findable in OmicsDI">
|
||||
-->
|
||||
<!--
|
||||
<xsl:template match="//*[local-name() = 'authors']">
|
||||
-->
|
||||
<!--
    Emits one datacite:creator per author token.
    The authors element is tokenized on ', and ', ',' and ' and '. The transformation is
    terminated when no authors element with a non-blank value other than 'null' exists.
-->
<xsl:template name="authors">
    <xsl:choose>
        <xsl:when test="not(//*[local-name() = 'authors'][string-length(normalize-space(.)) > 0 and not(. = 'null')])">
            <xsl:call-template name="terminate" />
        </xsl:when>
        <xsl:otherwise>
            <xsl:for-each select="tokenize(//*[local-name() = 'authors'], '(, and |,| and )')">

                <!-- normalise once per author instead of calling the extension function three times -->
                <xsl:variable name="normalizedName" select="personCleaner:normalize(.)"/>

                <xsl:element name="datacite:creator">

                    <xsl:element name="datacite:creatorName">
                        <xsl:value-of select="$normalizedName"/>
                    </xsl:element>

                    <!-- the part after the first comma is the given name, the part before it the family name -->
                    <xsl:element name="datacite:givenName">
                        <xsl:value-of select="normalize-space(substring-after($normalizedName, ','))"/>
                    </xsl:element>
                    <xsl:element name="datacite:familyName">
                        <xsl:value-of select="substring-before($normalizedName, ',')"/>
                    </xsl:element>

                </xsl:element>
            </xsl:for-each>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>
|
||||
|
||||
<!--
|
||||
<xsl:template match="//*[local-name() = 'DOI']">
|
||||
-->
|
||||
<xsl:template name="relatedPaper">
|
||||
<xsl:element name="datacite:relatedIdentifier">
|
||||
<xsl:attribute name="relatedIdentifierType">
|
||||
<xsl:value-of select="'DOI'"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="relationType">
|
||||
<xsl:value-of select="'isReferencedBy'"/>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="//*[local-name() = 'DOI']"/>
|
||||
</xsl:element>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -90,6 +90,7 @@ public class CleanGraphSparkJob {
|
|||
.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
|
||||
.filter((FilterFunction<T>) value -> filter(value))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
|
|
@ -67,6 +67,7 @@ public class CleaningFunctionTest {
|
|||
|
||||
assertNotNull(p_out.getPublisher());
|
||||
assertNull(p_out.getPublisher().getValue());
|
||||
|
||||
assertEquals("und", p_out.getLanguage().getClassid());
|
||||
assertEquals("Undetermined", p_out.getLanguage().getClassname());
|
||||
|
||||
|
@ -120,6 +121,9 @@ public class CleaningFunctionTest {
|
|||
.isPresent());
|
||||
|
||||
Publication p_cleaned = CleaningFunctions.cleanup(p_out);
|
||||
|
||||
assertEquals(1, p_cleaned.getTitle().size());
|
||||
|
||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||
assertNull(p_out.getPublisher());
|
||||
|
||||
|
|
|
@ -865,6 +865,28 @@
|
|||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "test test 123 test"
|
||||
}
|
||||
]
|
||||
}
|
Loading…
Reference in New Issue