merged stable ids

This commit is contained in:
Sandro La Bruzzo 2021-07-12 17:00:43 +02:00
commit bbe8193930
36 changed files with 2577 additions and 198 deletions

View File

@ -131,18 +131,9 @@ public class HttpConnector2 {
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode())) {
// CLIENT ERROR, DO NOT RETRY
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException("4xx error: request will not be repeated. " + report);
}
if (is5xx(urlConn.getResponseCode())) {
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:

View File

@ -21,6 +21,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
public class OaiCollectorPlugin implements CollectorPlugin {
public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";
// Parameter-name constants; every one of them is a plain string key.
private static final String FORMAT_PARAM = "format";
private static final String OAI_SET_PARAM = "set";
// Typed as String (previously Object) so it matches its sibling constants and its literal value.
private static final String OAI_FROM_DATE_PARAM = "fromDate";
@ -62,11 +65,11 @@ public class OaiCollectorPlugin implements CollectorPlugin {
throw new CollectorException("Param 'mdFormat' is null or empty");
}
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
// fromDate is optional; when present it must match DATE_REGEX or UTC_DATETIME_REGEX,
// otherwise a CollectorException is thrown. The message lists both accepted layouts.
if (fromDate != null && !fromDate.matches(DATE_REGEX) && !fromDate.matches(UTC_DATETIME_REGEX)) {
throw new CollectorException("Invalid date (YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ): " + fromDate);
}
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
// untilDate is optional; when present it must match DATE_REGEX or UTC_DATETIME_REGEX,
// otherwise a CollectorException is thrown. The message lists both accepted layouts.
if (untilDate != null && !untilDate.matches(DATE_REGEX) && !untilDate.matches(UTC_DATETIME_REGEX)) {
throw new CollectorException("Invalid date (YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ): " + untilDate);
}

View File

@ -107,10 +107,12 @@ public class OaiIterator implements Iterator<String> {
if (set != null && !set.isEmpty()) {
url += "&set=" + URLEncoder.encode(set, "UTF-8");
}
if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (fromDate != null && (fromDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| fromDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
}
if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (untilDate != null && (untilDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| untilDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
}
log.info("Start harvesting using url: " + url);

View File

@ -0,0 +1,143 @@
// From PROD (2021-07-06): tf script of HAL, with around 3 million records
declare_script "dc_cleaning_OpenAIREplus_compliant_hal";
declare_ns oaf = "http://namespace.openaire.eu/oaf";
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
declare_ns dc = "http://purl.org/dc/elements/1.1/";
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
declare_ns oai = "http://www.openarchives.org/OAI/2.0/";
declare_ns xs = "http://www.w3.org/2001/XMLSchema";
$var0 = "''";
$varFP7 = "'corda_______::'";
$varH2020 = "'corda__h2020::'";
$varDummy = "''";
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
static $varRepoid = xpath:"//dri:repositoryId";
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
dri:objIdentifier = xpath:"//dri:objIdentifier";
dri:repositoryId = $varRepoid;
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
//
// communities - deactivated until we receive the green light from DARIAH to mark the community on prod as well
// $varCommunity = xpath:"//*[local-name()='setSpec'][starts-with(., 'collection:DARIAH')]/'dariah'";
// the concept must not appear with an empty id attribute, i.e. when there is no community - ugly, but it seems to work (oaf:datasourceprefix is used only because it is a field available in all records)
// oaf:concept = set(xpath:"//oaf:datasourceprefix[string-length($varCommunity) gt 0]/''", @id = $varCommunity;);
//
// apply xpath:"//dc:contributor[starts-with(., 'European Project')]" if xpath:"string-length(replace(., '.*(\d{6,6}).*', '$1')) = 6" oaf:projectid = xpath:"concat($var1, replace(., '.*(\d{6,6}).*', '$1'))"; else $varDummy = "''";
apply xpath:"//dc:creator" if xpath:"string-length(.) &gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
if xpath:"//dc:title[string-length(.)&gt; 0]" $varDummy = "''"; else dc:coverage = skipRecord();
dc:title = xpath:"//dc:title[string-length(.) &gt; 0]/normalize-space(.)";
apply xpath:"//dc:subject" if xpath:"string-length(.) &gt; 0" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:publisher" if xpath:"string-length(.) &gt; 0" dc:publisher = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:source" if xpath:"string-length(.) &gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:contributor = xpath:"//dc:contributor";
// dc:description = xpath:"//dc:description/normalize-space(.)";
//dc:description = xpath:"string-join(//dc:description/normalize-space(.), concat('; ',codepoints-to-string(10)))";
dc:description = xpath:"string-join(//dc:description/normalize-space(.), '; ')";
dc:format = xpath:"//dc:format";
$varHttpTest = "''";
oaf:fulltext = xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))]";
//if xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))] or //dc:relation[starts-with(lower-case(normalize-space(.)), 'info:eu-repo/grantagreement')] or //dc:rights[starts-with(lower-case(normalize-space(.)), 'open') or contains(lower-case(normalize-space(.)), 'openaccess')] or //dc:accessRights[contains(lower-case(normalize-space(.)), 'openaccess')]" $var0 = "''"; else dc:coverage = skipRecord();
if xpath:"//dc:identifier[starts-with(., 'http')]" $var0 = "''"; else dc:coverage = skipRecord();
apply xpath:"//dc:identifier" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
// Copy the collection date from the incoming record and stamp the transformation time.
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
static dr:dateOfTransformation = xpath:"current-dateTime()";
// Pass dc:type and dc:date through unchanged.
// dc:format is not mapped again here: the identical rule already exists right after dc:description,
// and a second copy of the same rule could only repeat the same values.
dc:type = xpath:"//dc:type";
dc:date = xpath:"//dc:date";
dc:language = Convert(xpath:"//dc:language", Languages);
$varDateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
if xpath:"starts-with($varDateAccepted, '0')" oaf:dateAccepted = $varDummy; else oaf:dateAccepted = $varDateAccepted;
$varEmbargoEnd = xpath:"//dc:date[matches(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', 'i')][contains(lower-case(.), 'info:eu-repo')]/replace(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', '$3', 'i')";
oaf:embargoenddate = $varEmbargoEnd;
// FP7
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2006][contains(lower-case(.), 'info:eu-repo')]/concat($varFP7, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
// H2020
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
// H2020 workaround for HAL
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', 'i')][//dc:contributor[contains(lower-case(.), 'h2020')]][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
dc:relation = xpath:"//dc:relation";
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
//
oaf:collectedDatasourceid = xpath:"$varDatasourceid";
//
//if xpath:"//dc:type[1]/lower-case(.) = 'text'" dr:CobjCategory = Convert(xpath:"reverse(//dc:type) | //oai:setSpec", TextTypologies); else dr:CobjCategory = Convert(xpath:"//dc:type | //oai:setSpec", TextTypologies);
$varCobjCategoryReverse = Convert(xpath:"insert-before(reverse(//dc:type) , 0, reverse(//oai:setSpec))", TextTypologies);
$varSuperTypeReverse = Convert(xpath:"normalize-space($varCobjCategoryReverse)", SuperTypes);
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other') or //oaf:datasourceprefix/lower-case(.) = 'openedition_']/$varCobjCategoryReverse", @type = $varSuperTypeReverse;);
$varCobjCategoryStraight = Convert(xpath:"insert-before(//dc:type , 100, //oai:setSpec)", TextTypologies);
$varSuperTypeStraight = Convert(xpath:"normalize-space($varCobjCategoryStraight)", SuperTypes);
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[not(//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other'))]/$varCobjCategoryStraight", @type = $varSuperTypeStraight;);
//
// review level
// oaf:refereed = Convert(xpath:"//dc:description", ReviewLevels);
// Review-level candidates are collected from three sources and then reduced to a single oaf:refereed value.
// NOTE(review): judging by the patterns below, '0001' presumably means peer-reviewed and '0002' not peer-reviewed - confirm against the ReviewLevels vocabulary.
// Source 1: dc:type, oai:setSpec and dc:description values passed through the ReviewLevels conversion.
$varRefereedConvt = Convert(xpath:"(//dc:type, //oai:setSpec, //dc:description)", ReviewLevels);
// Source 2: dc:description wording. "(this book|it) constitutes the (thoroughly) refereed" or "peer review under responsibility of" yields '0001';
// a description starting with "version préliminaire", or consisting of "version 0", yields '0002' (this second test is not lower-cased).
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*(this\s*book|it)\s*constitutes\s*the\s*(thoroughly\s*)?refereed') or matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*')]/'0001', //dc:description[matches(., '^version\s*(préliminaire.*|0$)')]/'0002')";
// Source 3: dc:identifier wording. A delimited "preprint(s)" token yields '0002', but only when the record has exactly one dc:identifier;
// a delimited "refereed" token or the substring "-peer-reviewed-article-" yields '0001'.
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])pre[\.\-_/\s\(\)]?prints?([\.\-_/\s\(\)].*)?$')][count(//dc:identifier) = 1]/'0002', //*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])refereed([\.\-_/\s\(\)\d].*)?$')]/'0001', //*[string(node-name(.)) = 'dc:identifier' and contains(lower-case(.), '-peer-reviewed-article-')]/'0001')";
// Merge the three candidate sequences into one.
$varRefereed = xpath:"($varRefereedConvt, $varRefereedIdntf, $varRefereedDesct)";
// Precedence: '0001' is emitted as soon as any source produced it; '0002' is emitted only when some source produced it and none produced '0001'.
// The else branches only assign $varDummy (apparently a no-op placeholder for the mandatory else part).
if xpath:"count(index-of($varRefereed, '0001')) &gt;0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0002')) &gt;0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
//
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') and (xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) gt current-date())" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
// apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') " oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
// 2021-06-01, acz: the next line avoids OPEN as the default access right by setting UNKNOWN instead (2021-07-05, acz)
//if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics') and not(xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) lt current-date())]" $var0 = "''"; else oaf:accessrights = "UNKNOWN";
oaf:license = xpath:"//dc:rights[starts-with(., 'http') or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')]";
//
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
//
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"info:\") or starts-with(., \"urn:\") or starts-with(., \"doi:\") or starts-with(., \"DOI:\") or starts-with(., \"Doi:\") or starts-with(., \"doi \") or starts-with(., \"DOI \") or starts-with(., \"Doi \") or starts-with(., \"10.\") or ((starts-with(., \"http\")) and contains(., \"doi.org/10.\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/10.\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/http\")) and contains(., \"doi.org/10.\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdHdl = identifierExtract('["//dc:identifier[starts-with(., \"HDL:\") and not(starts-with(., \"HDL: http\"))][not(contains(., \"123456789\"))]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/hdl/\") or (starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/url/\") and contains(., \"://hdl.handle.net/\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(info:hdl:|://hdl.handle.net/|info:eu-repo/semantics/altIdentifier/hdl/))(\d.*)');
$varIdIsbn = xpath:"(//dc:identifier, //dc:source)[starts-with(lower-case(.), 'isbn') or starts-with(., '978') or starts-with(., '979')][(matches(., '(isbn[:\s]*)?97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(concat('97', substring-after(., '97'))) = 17) or matches(., '(isbn[:\s]*)?97[89]\d{10}$', 'i')]/replace(., 'isbn[:\s]*', '', 'i'), //dc:relation[starts-with(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')][(matches(., 'info:eu-repo/semantics/altIdentifier/isbn/97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(.) = 59) or matches(., 'info:eu-repo/semantics/altidentifier/isbn/97[89]\d{10}$', 'i')]/substring-after(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')";
$varIdBibc = identifierExtract('["//dc:identifier[starts-with(., \"BibCode:\") or starts-with(., \"BIBCODE:\") or (starts-with(., \"http:\") and contains(., \"bibcode=\"))]"]' , xpath:"./*[local-name()='record']" , '(^(BibCode:|BIBCODE:|http).*$)');
$varIdPtnt = identifierExtract('["//dc:identifier[starts-with(., \"Patent N°:\")]"]' , xpath:"./*[local-name()='record']" , '(^Patent N°:.*$)');
$varPmId = identifierExtract('["//dc:identifier[starts-with(normalize-space(.), \"PUBMED:\")]"]' , xpath:"./*[local-name()='record']" , '(?!PUBMED: )(\d+)');
$varIdPmc = identifierExtract('["//dc:identifier[starts-with(., \"PUBMEDCENTRAL:\") or (starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/PMC\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/http\")) and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(PMC\d+)');
//$varIdHal = identifierExtract('["//dc:identifier[starts-with(., \"ads-\") or starts-with(., \"anses-\") or starts-with(., \"artxibo-\") or starts-with(., \"bioemco-\") or starts-with(., \"cea-\") or starts-with(., \"cel-\") or starts-with(., \"cirad-\") or starts-with(., \"edutice-\") or starts-with(., \"emse-\") or starts-with(., \"EMSE-\") or starts-with(., \"ensl-\") or starts-with(., \"hal-\") or starts-with(., \"HAL-\") or starts-with(., \"halsde-\") or starts-with(., \"halshs-\") or starts-with(., \"hprints-\") or starts-with(., \"in2p3-\") or starts-with(., \"ineris-\") or starts-with(., \"inria-\") or starts-with(., \"Inria-\") or starts-with(., \"inserm-\") or starts-with(., \"insu-\") or starts-with(., \"INSU-\") or starts-with(., \"ird-\") or starts-with(., \"irsn-\") or starts-with(., \"jpa-\") or starts-with(., \"lirmm-\") or starts-with(., \"medihal-\") or starts-with(., \"meteo-\") or starts-with(., \"mnhn-\") or starts-with(., \"obspm-\") or starts-with(., \"pastel-\") or starts-with(., \"pasteur-\") or starts-with(., \"Pasteur-\") or starts-with(., \"peer-\") or starts-with(., \"ssa-\") or starts-with(., \"tel-\") or starts-with(., \"ujm-\") or starts-with(., \"ijn_\") or starts-with(., \"sic_\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\") or contains(., \"://medihal.archives-ouvertes.fr/hal\")))]", "//dc:relation[((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\")) and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , 
'((ads|anses|artxibo|bioemco|cea|cel|cirad|edutice|emse|EMSE|ensl|hal|HAL|halsde|halshs|hprints|in2p3|ineris|inria|Inria|inserm|insu|INSU|ird|irsn|jpa|lirmm|medihal|meteo|mnhn|obspm|pastel|pasteur|Pasteur|peer|ssa|tel|ujm)-|(ijn|sic)_).*');
$varIdHal = identifierExtract('["//*[local-name() = \"recordIdentifier\"]"]' , xpath:"./*[local-name()='record']" , '(oai:HAL:.*)');
$varIdArxv = identifierExtract('["//dc:identifier[((starts-with(., \"http\") or starts-with(., \"ArXiv: http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\"))) or starts-with(., \"arXiv:\") or starts-with(., \"ARXIV:\")]", "//dc:relation[(starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/\") and not(contains(., \"/arxiv/http\"))) or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\")))]"]' , xpath:"./*[local-name()='record']" , '(?!(://arxiv.org/abs/|:eu-repo/semantics/altIdentifier/arxiv/))([a-zA-Z].*)');
$varIdWos = identifierExtract('["//dc:identifier[starts-with(., \"WOS:\") or starts-with(., \"wos: WOS:\")]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/wos/\")]"]' , xpath:"./*[local-name()='record']" , '(info.*|WOS:.+|wos: WOS:.+)');
//oaf:identifier = set(xpath:"$varId//value[not[. = '10.1145/nnnnnnn.nnnnnnn']]", @identifierType = "doi";);
oaf:identifier = set(xpath:"$varIdDoi//value[not(. = '10.1145/nnnnnnn.nnnnnnn')]", @identifierType = "doi";);
oaf:identifier = set(xpath:"$varIdHdl//value", @identifierType = "handle";);
oaf:identifier = set(xpath:"$varIdIsbn", @identifierType = "isbn";);
oaf:identifier = set(xpath:"($varIdBibc//value[not(starts-with(., 'http'))]/replace(., 'BIBCODE:\s*', ''), $varIdBibc//value[starts-with(., 'http') and contains(substring-after(., 'bibcode='), codepoints-to-string(38))]/substring-before(substring-after(., 'bibcode='), codepoints-to-string(38)), $varIdBibc//value[starts-with(., 'http') and not(contains(substring-after(., 'bibcode='), codepoints-to-string(38)))]/substring-after(., 'bibcode='))", @identifierType = "bibcode";);
oaf:identifier = set(xpath:"$varIdPtnt//value/normalize-space(substring-after(., 'Patent N°:'))", @identifierType = "patentNumber";);
oaf:identifier = set(xpath:"$varPmId//value", @identifierType = "pmid";);
oaf:identifier = set(xpath:"$varIdPmc//value", @identifierType = "pmcid";);
//oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"distinct-values(($varIdArxv//value/normalize-space(replace(., '(https?://arxiv.org/abs/|https?://arxiv.org/pdf/|info:eu-repo/semantics/altIdentifier/arxiv/|info:eu-repo/semantics/altIdentifier/url/|info:eu-repo/semantics/altIdentifier/urn/|arXiv:|\.pdf)', '', 'i'))))", @identifierType = "arxiv";);
oaf:identifier = set(xpath:"$varIdWos//value/normalize-space(replace(., '(info:eu-repo/semantics/altIdentifier/wos/|WOS:|wos:)', ''))", @identifierType = "wos";);
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and contains(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))]/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "landingPage";);
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and not(ends-with(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', '')))])", @identifierType = "url";);
oaf:identifier = set(xpath:"//dri:recordIdentifier", @identifierType = "oai-original";);
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
// journal data
// Regular expressions are avoided on purpose. Stated intent: a) correct ISSNs that have no '-' (or another character in its place)
// and b) ignore anything that follows the ISSN (e.g. print/online/...).
// How the value is built: take the first four characters of the normalized text after the 'ISSN:' / 'EISSN:' label,
// append '-', then append the normalized remainder of the original value that follows those four characters.
// NOTE(review): the remainder starts directly after the fourth character, so a separator already present there
// (e.g. 'ISSN: 1234-5678') appears to be kept after the inserted '-', and trailing text is not cut off - verify against real HAL dc:source values.
// Only dc:source values starting with 'ISSN:' and longer than 12 characters are considered.
$varISSN = xpath:"//dc:source[starts-with(., 'ISSN:') and string-length(.) &gt; 12]/concat(substring(normalize-space(substring-after(., 'ISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'ISSN:')), 1, 4))))";
//$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) &gt; 13]/normalize-space(substring-after(., 'ISSN:'))";
// Same construction for values starting with 'EISSN:' and longer than 13 characters.
$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) &gt; 13]/concat(substring(normalize-space(substring-after(., 'EISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'EISSN:')), 1, 4))))";
// Emit oaf:journal with empty text and the two values as attributes, but only when at least one of them was found;
// oaf:datasourceprefix serves as the context node the predicate is attached to.
oaf:journal = set(xpath:"//oaf:datasourceprefix[$varISSN or $varEISSN]/''", @issn = xpath:"$varISSN";, @eissn = xpath:"$varEISSN";);
end

View File

@ -0,0 +1,140 @@
// From PROD (2021-07-06): tf script of DOAJ, with more than 6 million records
declare_script "dc_cleaning_OpenAIREplus_compliant_doaj";
declare_ns oaf = "http://namespace.openaire.eu/oaf";
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
declare_ns dc = "http://purl.org/dc/elements/1.1/";
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
$var0 = "''";
$varFP7 = "'corda_______::'";
$varH2020 = "'corda__h2020::'";
$varDummy = "''";
// $varUnknownRepoId = "'openaire____::55045bd2a65019fd8e6741a755395c8c'";
//
$varUnknownRepoId = "'openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18'";
$varUnknownRepoName = "'Unknown Repository'";
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
static $varRepoid = xpath:"//dri:repositoryId";
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
dri:objIdentifier = xpath:"//dri:objIdentifier";
dri:repositoryId = $varRepoid;
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
// Skip the record when a dc:creator satisfies ALL stacked predicates: non-empty, contains 'CDATA', starts with '(' and starts with '.'.
// NOTE(review): a value cannot start with both '(' and '.', so as written this condition looks unsatisfiable and skipRecord() would never fire here;
// the commented-out rule below treats these as three separate exclusion criteria - confirm whether an 'or' was intended.
// NOTE(review): the comparison is written as '&amp;gt;' here but as '&gt;' in other rules of this script - confirm the escaping level is intended.
if xpath:"//dc:creator[string-length(normalize-space(.)) &amp;gt; 0][contains(., 'CDATA')][starts-with(normalize-space(.), '(')][starts-with(normalize-space(.), '.')]" dc:creator = skipRecord(); else $varDummy = "''";
//apply xpath:"//dc:creator" if xpath:"string-length(normalize-space(.)) &amp;amp;gt; 0 and not(contains(., 'CDATA')) and not(starts-with(normalize-space(.), '.')) and not(starts-with(normalize-space(.), '('))" dc:creator = Convert(xpath:".", Person); else $varDummy = "''";
if xpath:"count(//dc:creator) = 0" dc:creator = skipRecord(); else $varDummy = "''";
//apply xpath:"//dc:creator" if xpath:"string-length(.) &amp;gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
$varOrcidName = xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0]";
$varOrcidOrcid = xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0]/@id/replace(., 'https?://orcid.org/', '')";
dc:creator = set(xpath:"$varOrcidName", @nameIdentifier = xpath:"subsequence($varOrcidOrcid,position(),1)";, @nameIdentifierScheme=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','ORCID')";, @schemeUri=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','http://orcid.org/')";);
if xpath:"count(//dc:title[string-length(.) &amp;gt; 0]) = 0" dc:title = skipRecord(); else $varDummy = "''";
dc:title = xpath:"//dc:title/normalize-space(replace(., '^(&amp;lt;title language=)(.)*(&amp;gt;)', ''))";
// apply xpath:"//dc:title" if xpath:"string-length(.) &amp;gt; 0" dc:title = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:subject" if xpath:"string-length(.) &amp;gt; 0 and not(@xsi:type = 'dcterms:LCSH')" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:subject = set(xpath:"//dc:subject[@xsi:type = 'dcterms:LCSH']/concat('lcsh:', .)", @classid=xpath:"'lcsh'";, @classname=xpath:"'lcsh'";, @schemeid=xpath:"'dnet:subject_classification_typologies'";, @schemename=xpath:"'dnet:subject_classification_typologies'";);
apply xpath:"//dc:publisher" if xpath:"string-length(.) &amp;gt; 0" dc:publisher = xpath:"normalize-space(replace(., '(&amp;lt;br&amp;gt;)', ''))"; else $varDummy = "''";
apply xpath:"//dc:source" if xpath:"string-length(.) &amp;gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:contributor = xpath:"//dc:contributor";
dc:description = xpath:"//dc:description[not(starts-with(., 'URN: urn:nbn:') or starts-with(., 'URN: http'))]";
dc:format = xpath:"//dc:format";
$varHttpTest = "''";
if xpath:"//dc:relation[starts-with(., 'http') or starts-with(., 'www.')]" $varHttpTest = "true"; else dc:identifier = skipRecord();
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'www.')" dc:identifier = xpath:"concat('http://', normalize-space(.))"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
dr:CobjIdentifier = xpath:"distinct-values(//dc:identifier[not(starts-with(normalize-space(.), 'http'))][not(normalize-space(.) = ($varIdList))][not(starts-with(normalize-space(.), 'urn:nbn:') or starts-with(normalize-space(.), 'URN:NBN:'))][not(. = ($varISSN[1], $varISSN[2]))][normalize-space(.) != ''])";
dc:identifier = xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .), $varIdLdpg//value, $varIdDoi//value)[1]";
dc:relation = xpath:"//dc:relation[starts-with(., 'https://doaj.org/toc/')]";
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
static dr:dateOfTransformation = xpath:"current-dateTime()";
// dc:type = xpath:"//dc:type";
dc:language = Convert(xpath:"//dc:language", Languages);
//if xpath:"//dc:rights[text()='info:eu-repo/semantics/openAccess']" dc:publisher = xpath:"//dc:publisher"; else dc:publisher = skipRecord();
dc:date = xpath:"//dc:date";
oaf:dateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
apply xpath:"//dc:date" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/date')" oaf:embargoenddate = RegExpr(xpath:"normalize-space(.)", $var0, "s/^(.*info:eu-repo\/date\/embargoEnd\/)//gmi"); else $var0 = "''";
//apply xpath:"//dc:relation" if xpath:"string-length(substring-after(normalize-space(.), 'info:eu-repo/grantAgreement/EC/FP7/')) = 6" oaf:projectid = RegExpr(xpath:"normalize-space(.)", $var1, "s/^(.*info:eu-repo\/grantAgreement\/EC\/FP7\/)//gmi"); else dc:relation = xpath:"normalize-space(.)";
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
//
oaf:collectedDatasourceid = $varDatasourceid;
//
// apply xpath:"//dc:type" if xpath:"." dr:CobjCategory = Convert(xpath:"normalize-space(.)", TextTypologies); else dc:type = xpath:".";
//dr:CobjCategory = "0001";
$varCobjCategory = Convert(xpath:"//dc:type", TextTypologies);
$varSuperType = Convert(xpath:"normalize-space($varCobjCategory)", SuperTypes);
dr:CobjCategory = set($varCobjCategory, @type = $varSuperType;);
dc:type = xpath:"//dc:type";
//
// review status
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]/'0001', //*[string(node-name(.)) = 'dc:relation' and matches(., '^info:eu-repo/semantics/altIdentifier/doi/10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$', 'i')]/'0001')";
$varRefereedProse = xpath:"(//*[string(node-name(.)) = 'dc:description' and matches(lower-case(.), '.*this\s*preprint\s*has\s*been\s*reviewed\s*and\s*recommended\s*by\s*peer\s*community') and contains(., '10.24072/')]/'0001', //dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001')";
$varRefereedReltn = xpath:"(//dc:relation, //dc:identifier)[contains(., '://www.dovepress.com/') and matches(lower-case(.), '.*-peer-reviewed-(fulltext-)?article-.*')]/'0001'";
$varRefereedTitle = xpath:"//dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001'";
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or matches(lower-case(.), '(this|a)\s*(article|preprint)\s*(has\s*been\s*)?(peer[\-\s]*)?reviewed\s*and\s*recommended\s*by\s*peer[\-\s]*community')]/'0001')";
$varRefereed = xpath:"($varRefereedIdntf, $varRefereedProse, $varRefereedReltn, $varRefereedTitle, $varRefereedDesct)";
//if xpath:"$varRefereed" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0001')) &gt;0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0002')) &gt;0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
//
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics')]" $var0 = "''"; else oaf:accessrights = "OPEN";
//if xpath:"count(//dc:rights) = 0" oaf:accessrights = "OPEN"; else $var0 = "''";
// oaf:accessrights = Convert(xpath:"normalize-space(//dc:rights)", AccessRights);
oaf:license = xpath:"(//dc:rights, //dc:relation)[starts-with(normalize-space(.), 'http') and (contains(., '/licenses/') or contains(., '/licence/') or contains(., '/licencias/') or contains(., '/licencia/') or contains(., '://creativecommons.org/') or contains(., '://rightsstatements.org/')) or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')][not(contains(normalize-space(.), ' '))]/normalize-space(.)";
//
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
//
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&amp;lt;&amp;gt;]*/[^\s"&amp;lt;&amp;gt;]+)');
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]", "//dc:relation[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdHdl = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://hdl.handle.net/\")][not(contains(., \"123456789\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(://hdl.handle.net/))(\d.*)');
$varIdUrn = identifierExtract('["//dc:relation[starts-with(., \"urn:nbn:\") or starts-with(., \"URN:NBN:\") or (starts-with(., \"http\") and (contains(., \"://nbn-resolving.org/urn:nbn:\") or contains(., \"://nbn-resolving.de/urn/resolver.pl?urn:nbn:\") or contains(., \"://nbn-resolving.de/urn:nbn:\") or contains(., \"://resolver.obvsg.at/urn:nbn:\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve?urn=urn:nbn:\")))]", "//dc:description[contains(., \"URN: urn:nbn:de:0114-\") or contains(., \"URN: http://nbn-resolving.de/urn:nbn:de:0114-\") or (contains(., \"URN:NBN:no-\") and //dc:identifier = \"1893-1774\")]"]' , xpath:"./*[local-name()='record']" , '((urn:nbn:|URN:NBN:).*)');
$varIdArk = identifierExtract('["//dc:relation[starts-with(normalize-space(.), \"http\") and contains(., \"/ark:\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdPmid = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdPmc = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\") or contains(., \"//europepmc.org/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdHal = identifierExtract('["//dc:relation[starts-with(., \"hal-\") or starts-with(., \"halshs-\") or starts-with(., \"halsde-\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '(hal(shs|sde)?-.*)');
$varIdArxv = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))]"]' , xpath:"./*[local-name()='record']" , '(\d.*)');
$varIdLdpg = identifierExtract('["//dc:identifier[starts-with(., \"https://doaj.org/article/\")]"]', xpath:"./*[local-name()='record']" , '(http.*)');
$varIdUrl = identifierExtract('["//dc:relation[starts-with(., \"http\")][not(contains(., \"://doaj.org\"))][not(contains(., \"doi.org/\"))][not(contains(., \"hdl.handle.net/\"))][not(contains(., \"://nbn-resolving.de/\") or contains(., \"://nbn-resolving.org/\") or contains(., \"://resolver.obvsg.at/\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve\"))][not(contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))][not(contains(., \"://localhost/\") or contains(., \"://localhost:\"))]", "//dc:relation[starts-with(., \"www\")]"]', xpath:"./*[local-name()='record']" , '((http|www).*)');
$varIdList = xpath:"(($varIdDoi//value, $varIdHdl//value, $varIdUrn//value, $varIdArk//value, $varIdPmid//value, $varIdPmc//value, $varIdLdpg//value, $varIdUrl//value))";
// dropping/cleaning wrong DOIs, as
// 2 DOIs just different in 1 ending with . (mostly, but not exclusively, prefixed with 10.5216)
// noise stemming from odd/wrong DOI statements' formats
// DOIs with 2 prefixes
// DOI statements containing first the DOI prefix and then the DOI incl. the resolver prefix
//oaf:identifier = set(xpath:"$varId//value", @identifierType = "doi";);
//oaf:identifier = set(xpath:"$varIdDoi//value", @identifierType = "doi";);
oaf:identifier = set(xpath:"distinct-values(($varIdDoi//value[not(ends-with(., '.') and exists(index-of($varIdDoi//value, substring(., 1, string-length(.)-1))))][not(. = '10.4313/article-4')][not(lower-case(.) = ('10.30659/ijibe.2.1.171-181', '10.30659/ijibe.2.1.171', '10.26843/rencima.v8i4.149', '10.26843/rencima.v11i1.215', '10.18273/revfue.v14n2-2016002revista', '10.17061/phrp3112015', '10.21789/24222704', '10.22432/pjsr.2017.14.', '10.22432/pjsr.2017.18.02', '10.22432/pjsr.2017.18.'))][not(starts-with(., '10.1530/VAB-'))][not(starts-with(lower-case(.), '10.1155/s168761720'))][not(starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/'))][not(starts-with(., '10.7454/jvi.v') and string-length(.) = 16)][not(starts-with(., '10.15094/0000') and string-length(.) = 16)][not(matches(., '^10\.\d*/DOI:$'))][not(starts-with(., concat(substring-before(., '/'), '/', substring-before(., '/'), '/')))][not(matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/')))][not(starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.(eor|20050521)$'))][not(substring-before(., '/') = ('10.19183', '10.18066') and matches(., '^(10\.19183/how\.\d*\.\d*|10\.18066/revunivap\.v\d*i\d*)$'))]/lower-case(.), $varIdDoi//value[matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/'))]/substring-after(., 'doi.org/'), $varIdDoi//value[starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.eor$')]/substring(., 1, 28), $varIdDoi//value[starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/')]/substring-after(., '/')))", @identifierType = "doi";);
oaf:identifier = set(xpath:"distinct-values($varIdHdl//value/normalize-space(replace(., '\?locatt=view:master', '')))", @identifierType = "handle";);
oaf:identifier = set(xpath:"$varIdUrn//value", @identifierType = "urn";);
oaf:identifier = set(xpath:"distinct-values($varIdArk//value/replace(substring-after(., '/ark:'), '^/', ''))", @identifierType = "ark";);
oaf:identifier = set(xpath:"distinct-values($varIdPmid//value/replace(., 'https?://www.ncbi.nlm.nih.gov/pmc/articles/pmid/(\d+)(/.*)?', '$1'))", @identifierType = "pmid";);
oaf:identifier = set(xpath:"distinct-values($varIdPmc//value/replace(., 'https?://(www.ncbi.nlm.nih.gov/pmc|europepmc.org)/articles/(PMC\d*)([/\?].*)?', '$2'))", @identifierType = "pmcid";);
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '/document', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"$varIdArxv//value", @identifierType = "arxiv";);
oaf:identifier = set(xpath:"$varIdLdpg//value", @identifierType = "landingPage";);
oaf:identifier = set(xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .))", @identifierType = "url";);
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
//$varJournalName = xpath:"substring-before(//dc:source, ',')";
$varJournalTitle = xpath:"(//dc:source[contains(., ', Vol ')]/substring-before(., ', Vol '), //dc:source[contains(., ', Iss ')]/substring-before(., ', Iss '))[1]";
$varVol = xpath:"//dc:source[contains(., ', Vol ')][matches(., ', Vol \d+')]/replace(substring-after(., ', Vol '), '^(\d+).*$', '$1')";
$varIss = xpath:"//dc:source[contains(., ', Iss ')][matches(., ', Iss \d+')]/replace(substring-after(., ', Iss '), '^(\d+).*$', '$1')";
$varSp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/substring-before(substring-after(., ', Pp '), '-')";
$varEp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/replace(substring-after(substring-after(., ', Pp '), '-'), '^(\d+).*$', '$1')";
$varISSN = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][1]"; , @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][2]";);
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9]";);
oaf:journal = set($varJournalTitle, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";, @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][2]";, @vol = xpath:"$varVol";, @iss = xpath:"$varIss";, @sp = xpath:"$varSp";, @ep = xpath:"$varEp";);
end

View File

@ -0,0 +1,492 @@
<!-- from PROD 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
extension-element-prefixes="transformExt TransformationFunction"
exclude-result-prefixes="transformExt TransformationFunction" >
<!-- Serialisation settings: indented output without an XML declaration. -->
<xsl:output indent="yes" omit-xml-declaration="yes"/>
<!--
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
-->
<!-- Datasource name and id; the root template writes them to the name/id attributes of oaf:hostedBy and oaf:collectedFrom. No default is declared here. -->
<xsl:param name="varOfficialName" />
<!-- NOTE(review): varDsType is not referenced by any template visible in this stylesheet. -->
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<!-- Funder DOI suffixes; award-group/institution-id values are compared against them with ends-with() to pick FP7 and H2020 grants. -->
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<!-- Prefixes concatenated in front of the award-id to build oaf:projectid values. -->
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<!-- NOTE(review): epmcUrlPrefix, repoCode and index are not referenced by any template visible in this stylesheet. -->
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<!-- Timestamp written into dr:dateOfTransformation by the header template; defaults to the current date and time. -->
<xsl:param name="transDate" select="current-dateTime()"/>
<!-- Instance of the extension class; passed as first argument to every TransformationFunction:convertString call. -->
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
<!-- NOTE(review): year, month and day are not referenced by any template visible in this stylesheet. Each takes the union of two pub-date selections, so more than one node can reach format-number when both variants are present; confirm before reusing them. -->
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<!-- Named template "terminate": emits a message with terminate="yes", which stops the transformation. The root template calls it when no non-empty article-title is found under article-meta. -->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<!-- in journal.fi xml:lang of translated titles is not within the trans-title element but within the surrounding trans-title-group element (which just contains 1 trans-title element) -->
<!--
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
-->
<xsl:call-template name="title">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
</xsl:element>
<!-- oaf:dateAccepted: built from pub-date elements with date-type 'pub', publication-format 'epub' and a 4-character year, as year '-' month '-' day.
     Month and day are padded by the substring trick in the active expression below: for a part X,
     substring(concat('0', X, '1'), string-length(X) idiv 2 + 1, 2) gives '01' when X is missing or empty,
     '0' followed by X when X has one character, and X itself when X has two characters.
     The commented-out expressions inside the element are earlier attempts kept for reference. -->
<xsl:element name="oaf:dateAccepted">
<!--
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />
<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
-->
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 +1, 2))" />
</xsl:element>
<!-- One dc:date per pub-date with date-type 'pub' and publication-format 'epub'.
     Precision depends on which parts are well-formed: year (4 chars), month (2 chars), day (2 chars).
     Nothing is emitted when the year is not a 4-character value. -->
<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
<xsl:variable name="varYearOk" select="exists(./*[local-name()='year' and string-length(normalize-space(.)) = 4])"/>
<xsl:variable name="varMonthOk" select="exists(./*[local-name()='month' and string-length(normalize-space(.)) = 2])"/>
<xsl:variable name="varDayOk" select="exists(./*[local-name()='day' and string-length(normalize-space(.)) = 2])"/>
<xsl:if test="$varYearOk">
<dc:date>
<xsl:choose>
<!-- full date -->
<xsl:when test="$varMonthOk and $varDayOk">
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
</xsl:when>
<!-- year and month only -->
<xsl:when test="$varMonthOk">
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
</xsl:when>
<!-- year only -->
<xsl:otherwise>
<xsl:value-of select="./*[local-name()='year']"/>
</xsl:otherwise>
</xsl:choose>
</dc:date>
</xsl:if>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
<xsl:with-param name="targetElement" select="'oaf:license'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
<oaf:identifier>
<xsl:attribute name="identifierType">
<xsl:text>landingPage</xsl:text>
</xsl:attribute>
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
<oaf:fulltext>
<xsl:value-of select="."/>
</oaf:fulltext>
</xsl:for-each>
<!-- Project links derived from funding metadata.
     An award-group is attributed to FP7 or H2020 when one of its institution-id descendants ends with the
     corresponding funder DOI. One oaf:projectid is emitted for every award-id child whose whitespace-normalised
     value is exactly six digits; the emitted id is the funder prefix followed by that normalised value.
     The loop runs over the matching award-id elements themselves, so the value that is written is always
     the value that was tested (previously the raw, un-normalised award-id children of the group were concatenated). -->
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]/*[local-name()='award-id'][matches(normalize-space(.), '^\d{6}$')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, normalize-space(.))"/>
</oaf:projectid>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]/*[local-name()='award-id'][matches(normalize-space(.), '^\d{6}$')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, normalize-space(.))"/>
</oaf:projectid>
</xsl:for-each>
<!-- -->
<xsl:variable name='varRights' select="distinct-values((for $i in (
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
return TransformationFunction:convertString($tf, normalize-space($i), 'AccessRights')))" />
<!--
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
-->
<!-- Access right of the record, chosen from $varRights (the distinct access-right terms computed above):
     'EMBARGO' wins when present, otherwise the first term other than 'UNKNOWN', otherwise the first term
     (which leaves the element empty when $varRights is empty).
     The tests use exists(): $varRights holds atomic strings, and the effective boolean value of a sequence
     of two or more atomic values is a dynamic error, which a bare test would hit as soon as two distinct
     terms other than 'UNKNOWN' are present. -->
<oaf:accessrights>
<xsl:choose>
<xsl:when test="exists($varRights[. = 'EMBARGO'])">
<xsl:value-of select="'EMBARGO'"/>
</xsl:when>
<xsl:when test="exists($varRights[. != 'UNKNOWN'])">
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$varRights[1]"/>
</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<!--
<oaf:accessrights>
<xsl:value-of select="$varRights[1]"/>
</oaf:accessrights>
<xsl:element name="oaf:accessrights">
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
</xsl:element>
-->
<!--
<xsl:element name="dr:CobjCategory">
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
<xsl:attribute name="type" select="$varSuperType"/>
<xsl:value-of select="$varCobjCategory" />
</xsl:element>
<xsl:variable name='varCobjCatLst' select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
-->
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')))" />
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
return concat($i, '###', TransformationFunction:convertString($tf, normalize-space($i), 'SuperTypes'))" />
<dr:CobjCategory>
<!-- Each entry of $varCobjSupLst has the form code###supertype. The entry to publish is the first hit of
     the following filters, tried in this order:
     1. supertype is not 'other' and code is none of 0038, 0039, 0040
     2. supertype is not 'other'
     3. code is neither 0020 nor 0000
     4. code is not 0000
     When no entry qualifies the category falls back to code 0000 with type 'other'. -->
<xsl:variable name="varCobjPick" select="(
$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))],
$varCobjSupLst[not(substring-after(., '###') = 'other')],
$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))],
$varCobjSupLst[not(substring-before(., '###') = ('0000'))])[1]"/>
<xsl:choose>
<xsl:when test="exists($varCobjPick)">
<xsl:attribute name="type" select="substring-after($varCobjPick, '###')"/>
<xsl:value-of select="substring-before($varCobjPick, '###')"/>
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="type" select="'other'"/>
<xsl:value-of select="'0000'"/>
</xsl:otherwise>
</xsl:choose>
</dr:CobjCategory>
<!--
<xsl:for-each select="$varCobjSupLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<xsl:for-each select="$varTypLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
<!--
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<oaf:language>
<xsl:value-of select="TransformationFunction:convertString($tf, //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'Languages')" />
</oaf:language>
<!-- review status -->
<!-- ToDo:
review status
~ ask Journal.fi to put it elsewhere
~ evaluate article-version (no example found yet)
subject/kwd:
~ handle thesauri (no example found yet)
relations:
~ handle fn (no example found yet)
-->
<!--
<xsl:variable name="varRefereedConvt" select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
-->
<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
<!--
<oaf:refereed>
<xsl:value-of select="$varRefereedDescp"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="$varRefereed"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
</oaf:refereed>
-->
<!-- Review level: '0001' as soon as any collected indicator in $varRefereed says so, else '0002' when
     at least one indicator is '0002'; no oaf:refereed element is written otherwise. -->
<xsl:variable name="varRefereedLevel" select="if (exists($varRefereed[. = '0001'])) then '0001' else if (exists($varRefereed[. = '0002'])) then '0002' else ()"/>
<xsl:if test="exists($varRefereedLevel)">
<oaf:refereed>
<xsl:value-of select="$varRefereedLevel"/>
</oaf:refereed>
</xsl:if>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<!-- Provenance: both elements carry the datasource name and id passed in as stylesheet parameters. -->
<oaf:hostedBy name="{$varOfficialName}" id="{$varDataSourceId}"/>
<oaf:collectedFrom name="{$varOfficialName}" id="{$varDataSourceId}"/>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<!-- Named template "allElements": writes one element named $targetElement per item of $sourceElement,
     with the whitespace-normalised string value of the item as content.
     For dc:title, dc:description and dc:subject an xml:lang attribute is added, taken from the item itself
     or, failing that, from its parent. -->
<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:variable name="varLangAllowed" select="$targetElement = ('dc:title', 'dc:description', 'dc:subject')"/>
<xsl:for-each select="$sourceElement">
<xsl:variable name="varLang" select="(./@xml:lang, ../@xml:lang)[1]"/>
<xsl:element name="{$targetElement}">
<xsl:if test="exists($varLang) and $varLangAllowed">
<xsl:attribute name="xml:lang" select="$varLang"/>
</xsl:if>
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<!-- Named template "title": writes one dc:title per item of $sourceElement.
     The content is the item followed by its following-sibling subtitle / trans-subtitle elements, each
     whitespace-normalised and joined with ': '. xml:lang comes from the item or, failing that, its parent.
     NOTE(review): the root template also passes subtitle and trans-subtitle elements as items, so a subtitle
     is written both appended to its title and as a dc:title of its own; confirm this is intended. -->
<xsl:template name="title">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:variable name="varLang" select="(./@xml:lang, ../@xml:lang)[1]"/>
<xsl:variable name="varParts" select="for $p in (., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')]) return normalize-space($p)"/>
<xsl:element name="dc:title">
<xsl:if test="exists($varLang)">
<xsl:attribute name="xml:lang" select="$varLang"/>
</xsl:if>
<xsl:value-of select="string-join($varParts, ': ')"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<!-- Named template "journal": writes a single oaf:journal element.
     Attributes issn, eissn, vol, iss, sp and ep are always present and hold the whitespace-normalised
     parameter of the same meaning (empty when the parameter is empty); the element content is the
     whitespace-normalised journal title. -->
<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn" select="normalize-space($issn)"/>
<xsl:attribute name="eissn" select="normalize-space($eissn)"/>
<xsl:attribute name="vol" select="normalize-space($vol)"/>
<xsl:attribute name="iss" select="normalize-space($issue)"/>
<xsl:attribute name="sp" select="normalize-space($sp)"/>
<xsl:attribute name="ep" select="normalize-space($ep)"/>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>
<!-- Named template "identifiers": writes the record DOI as oaf:identifier with identifierType 'doi'.
     $sourceElement is the set of candidate id elements; only those with pub-id-type 'doi' are considered.
     The first candidate whose whitespace-normalised value is non-empty is used and written normalised.
     Selecting a single item keeps string functions from receiving several DOI elements at once (a type
     error outside backwards-compatible mode) and stops whitespace-only values from producing an identifier.
     Nothing is written when no such candidate exists. -->
<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:variable name="varDoi" select="($sourceElement[@pub-id-type='doi'][string-length(normalize-space(.)) gt 0])[1]"/>
<xsl:if test="exists($varDoi)">
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="normalize-space($varDoi)"/>
</xsl:element>
</xsl:if>
</xsl:template>
<!-- Named template "authors": writes one dc:creator per item of $sourceElement as 'surname, given-names'.
     When the item has a contrib-id child with contrib-id-type 'orcid', the attributes nameIdentifierScheme,
     schemeURI and nameIdentifier are added. nameIdentifier is the first such contrib-id with surrounding
     whitespace removed and an optional leading http://orcid.org/ or https://orcid.org/ stripped, so
     https URIs and bare ORCID iDs no longer end up as an empty attribute.
     NOTE(review): an item without a name child still yields a dc:creator containing only ', '. -->
<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:variable name="varOrcidId" select="(./*[local-name()='contrib-id'][@contrib-id-type='orcid'])[1]"/>
<xsl:element name="dc:creator">
<xsl:if test="exists($varOrcidId)">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="replace(normalize-space($varOrcidId), '^https?://orcid\.org/', '', 'i')"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<!-- Header template: copies the matched header element, applies templates to its attributes and child nodes,
     and then appends a dr:dateOfTransformation element holding $transDate. -->
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<!-- Generic copy template: copies any node or attribute and recurses into its attributes and children. -->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,437 @@
<!-- from production 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
extension-element-prefixes="transformExt TransformationFunction"
exclude-result-prefixes="transformExt TransformationFunction" >
<!-- Serialisation: indented output without an XML declaration. -->
<xsl:output indent="yes" omit-xml-declaration="yes"/>
<!-- Defaults written to the id and name attributes of oaf:hostedBy by the root template. -->
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
<!-- Supplied by the caller. varOfficialName and varDataSourceId fill oaf:collectedFrom;
     varDsType is declared but not referenced by any template in this stylesheet. -->
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<!-- Funder DOIs: an award-group is treated as FP7 or H2020 when one of its
     institution-id values ends with the corresponding DOI. -->
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<!-- Prefixes prepended to the award id when oaf:projectid is written. -->
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<!-- Prefix concatenated with the pmcid article-id for dc:identifier and oaf:fulltext. -->
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<!-- Text before the first ':' of header/recordIdentifier; not referenced by any template in this stylesheet. -->
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<!-- Not referenced by any template in this stylesheet. -->
<xsl:param name="index" select="0"/>
<!-- Timestamp written to dr:dateOfTransformation by the header template. -->
<xsl:param name="transDate" select="current-dateTime()"/>
<!-- Java extension instance passed as first argument to TransformationFunction:convertString. -->
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
<!-- Electronic publication date parts, taken from pub-date[@pub-type='epub'] or
     pub-date[@date-type='pub' and @publication-format='electronic'] and formatted with
     the pictures '0000' / '00'. year and month feed oaf:dateAccepted; day is computed
     but never read (oaf:dateAccepted always uses '01' as day). -->
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<!-- terminate: aborts the whole transformation with a message. The root template
     calls it when the record has no non-empty article-title. -->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<!--
  Root template: builds the output <record> from an NLM/JATS article.
  The transformed header comes first, then a <metadata> element holding the
  dc / oaf / dr fields derived from article-meta and journal-meta, and finally
  a copy of any 'about' element of the input.
  The transformation is aborted (template 'terminate') when the article has no
  non-empty article-title.
-->
<xsl:template match="/">
  <record>
    <xsl:apply-templates select="//*[local-name() = 'header']" />
    <metadata>
      <!-- A record without a non-empty title is rejected. -->
      <xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
        <xsl:call-template name="terminate"/>
      </xsl:if>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0]"/>
        <xsl:with-param name="targetElement" select="'dc:title'"/>
      </xsl:call-template>
      <!-- Authors: contrib elements of type 'author' that are not collaborations and
           carry a name (directly or under name-alternatives) with a non-empty
           surname or given-names. -->
      <xsl:call-template name="authors">
        <xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))][./*[local-name()='name'] or ./*[local-name()='name-alternatives']/*[local-name()='name']][string-length(.//*[local-name()='surname']) + string-length(.//*[local-name()='given-names']) > 0]"/>
      </xsl:call-template>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()='abstract']"/>
        <xsl:with-param name="targetElement" select="'dc:description'"/>
      </xsl:call-template>
      <!-- Subjects: article categories, then keywords outside the mesh / ocis groups. -->
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
        <xsl:with-param name="targetElement" select="'dc:subject'"/>
      </xsl:call-template>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group' and not(lower-case(@kwd-group-type)=('mesh', 'ocis'))]//*[local-name()='kwd']"/>
        <xsl:with-param name="targetElement" select="'dc:subject'"/>
      </xsl:call-template>
      <!-- MeSH keywords: a leading 'mesh ' marker is removed and the value is prefixed with 'mesh:'. -->
      <xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='mesh' and ./*[local-name()='kwd']]">
        <xsl:for-each select="./*[local-name()='kwd']">
          <dc:subject>
            <xsl:attribute name="subjectScheme" select="'mesh'"/>
            <xsl:attribute name="schemeURI" select="'http://www.nlm.nih.gov/mesh/'"/>
            <xsl:attribute name="valueURI" select="''"/>
            <xsl:value-of select="./concat('mesh:', replace(., 'mesh (.*)$', '$1'))"/>
          </dc:subject>
        </xsl:for-each>
      </xsl:for-each>
      <!-- OCIS keywords: value prefixed with 'ocis:'. -->
      <xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='ocis' and ./*[local-name()='kwd']]">
        <xsl:for-each select="./*[local-name()='kwd']">
          <dc:subject>
            <xsl:attribute name="subjectScheme" select="'ocis'"/>
            <xsl:attribute name="schemeURI" select="''"/>
            <xsl:attribute name="valueURI" select="''"/>
            <xsl:value-of select="./concat('ocis:', .)"/>
          </dc:subject>
        </xsl:for-each>
      </xsl:for-each>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
        <xsl:with-param name="targetElement" select="'dc:publisher'"/>
      </xsl:call-template>
      <!-- dc:source: journal title(s), then one entry per article-version. -->
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
        <xsl:with-param name="targetElement" select="'dc:source'"/>
      </xsl:call-template>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/(*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version'], *[local-name() = 'article-version'])/concat('article-version (', @article-version-type, ') ', .)"/>
        <xsl:with-param name="targetElement" select="'dc:source'"/>
      </xsl:call-template>
      <!-- Language is hard-coded. -->
      <xsl:element name="dc:language">
        <xsl:text>eng</xsl:text>
      </xsl:element>
      <!-- Identifier and full-text link are both the Europe PMC URL of the pmcid. -->
      <xsl:element name="dc:identifier">
        <xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
      </xsl:element>
      <xsl:element name="oaf:fulltext">
        <xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
      </xsl:element>
      <!-- dateAccepted: electronic publication date when present (month defaults to 01
           when it is not a number, day is always 01), otherwise January 1st of the
           print publication year. -->
      <xsl:element name="oaf:dateAccepted">
        <xsl:choose>
          <xsl:when test="//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub'] or //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']" >
            <xsl:choose>
              <xsl:when test="string(number($month)) eq 'NaN'">
                <xsl:value-of select="concat($year, '-', '01', '-', '01')" />
              </xsl:when>
              <xsl:otherwise>
                <xsl:value-of select="concat($year, '-', $month, '-', '01')" />
              </xsl:otherwise>
            </xsl:choose>
          </xsl:when>
          <xsl:otherwise>
            <xsl:value-of select="concat(//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='ppub']/*[local-name()='year'], '-01-01')" />
          </xsl:otherwise>
        </xsl:choose>
      </xsl:element>
      <!-- Rights: distinct copyright statements, then distinct license texts. -->
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='copyright-statement'])"/>
        <xsl:with-param name="targetElement" select="'dc:rights'"/>
      </xsl:call-template>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='license'])"/>
        <xsl:with-param name="targetElement" select="'dc:rights'"/>
      </xsl:call-template>
      <xsl:call-template name="allElements">
        <xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
        <xsl:with-param name="targetElement" select="'dc:relation'"/>
      </xsl:call-template>
      <xsl:call-template name="identifiers">
        <xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
      </xsl:call-template>
      <!-- Project links. One oaf:projectid is written per award-id that consists of
           exactly six digits, using the trimmed value of that very award-id. Building
           the value from the whole award-id selection could pick an id other than the
           one that passed the test and kept surrounding whitespace. -->
      <xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI)]]">
        <xsl:for-each select="./*[local-name()='award-id'][matches(normalize-space(.), '^\d\d\d\d\d\d$')]">
          <oaf:projectid>
            <xsl:value-of select="concat($varFP7, normalize-space(.))"/>
          </oaf:projectid>
        </xsl:for-each>
      </xsl:for-each>
      <xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
        <xsl:for-each select="./*[local-name()='award-id'][matches(normalize-space(.), '^\d\d\d\d\d\d$')]">
          <oaf:projectid>
            <xsl:value-of select="concat($varH2020, normalize-space(.))"/>
          </oaf:projectid>
        </xsl:for-each>
      </xsl:for-each>
      <!-- Access rights and resource category are constant for this source. -->
      <xsl:element name="oaf:accessrights">
        <xsl:text>OPEN</xsl:text>
      </xsl:element>
      <xsl:element name="dr:CobjCategory">
        <xsl:attribute name="type" select="'publication'"/>
        <xsl:text>0001</xsl:text>
      </xsl:element>
      <dc:type>
        <xsl:value-of select="//*[local-name() = 'article']/@article-type"/>
      </dc:type>
      <!-- perhaps ensure that file indeed exists, e.g. as pdf etc -->
      <!-- Review level (oaf:refereed). Apart from the vocabulary lookup below, the
           individual checks are written as xsl:when tests instead of variables to
           reduce load for the big PubMed records: evaluation stops at the first
           matching branch. Branches yielding '0001' come before those yielding '0002'. -->
      <xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
        return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
      <xsl:choose>
        <!-- article-type or setSpec maps to 0001 in the ReviewLevels vocabulary -->
        <xsl:when test="count($varRefereedConvt[. = '0001']) > 0">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- DOI of one of the listed 10.12688 open research platforms (dot in doi.org is escaped) -->
        <xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'article-id'][@pub-id-type='doi'][matches(., '^(https?://(dx\.)?doi\.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- abstract starts with "refereed article" -->
        <xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- footnotes or notes in front/back mention peer review -->
        <xsl:when test="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
          matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
          matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
          matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
          matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
          matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
          matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
          matches(lower-case(.), '.*refereed\s*anonymously.*') or
          matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*') or
          matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\].*') or
          matches(lower-case(.), '.*\[.*referees\s*:.*\].*') or
          matches(lower-case(.), '^\s*plagiarism[\s\-\._]check.*') or
          matches(lower-case(.), '^\s*peer[\s\-\._]*review.*') or
          matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*') or
          matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*')]">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- supplementary media described as a peer review file -->
        <xsl:when test="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
          matches(lower-case(.), '.*peer\s*review\s*file.*')]">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- a contributor (group) has a reviewer role -->
        <xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
          [./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
          ./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]">
          <oaf:refereed>
            <xsl:value-of select="'0001'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- article-type or setSpec maps to 0002 in the ReviewLevels vocabulary -->
        <xsl:when test="count($varRefereedConvt[. = '0002']) > 0">
          <oaf:refereed>
            <xsl:value-of select="'0002'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- the record points to a (peer-)reviewed related article -->
        <xsl:when test="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]">
          <oaf:refereed>
            <xsl:value-of select="'0002'"/>
          </oaf:refereed>
        </xsl:when>
        <!-- the article version is declared as preprint -->
        <xsl:when test="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]">
          <oaf:refereed>
            <xsl:value-of select="'0002'"/>
          </oaf:refereed>
        </xsl:when>
      </xsl:choose>
      <xsl:call-template name="journal">
        <xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
        <xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
        <xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
        <xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
        <xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
        <xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
        <xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
      </xsl:call-template>
      <oaf:hostedBy>
        <xsl:attribute name="name">
          <xsl:value-of select="$varHostedByName"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varHostedById"/>
        </xsl:attribute>
      </oaf:hostedBy>
      <oaf:collectedFrom>
        <xsl:attribute name="name">
          <xsl:value-of select="$varOfficialName"/>
        </xsl:attribute>
        <xsl:attribute name="id">
          <xsl:value-of select="$varDataSourceId"/>
        </xsl:attribute>
      </oaf:collectedFrom>
      <!-- Country: taken from footnotes containing "country(/territory) of origin". The
           text after 'of origin', minus its first character, is looked up in the
           Countries vocabulary. -->
      <xsl:for-each select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = 'fn-group']/*[local-name() = 'fn'][matches(lower-case(.), 'country(/territory)? of origin:?\s*[A-Za-z\-]+')]">
        <oaf:country>
          <xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), 'of origin'), 2)), 'Countries')"/>
        </oaf:country>
      </xsl:for-each>
    </metadata>
    <xsl:copy-of select="//*[local-name() = 'about']" />
  </record>
</xsl:template>
<!--
  allElements: for every item of $sourceElement, writes an element named
  $targetElement whose content is the whitespace-normalised string value of the item.
  param sourceElement: nodes or atomic values to convert.
  param targetElement: lexical QName of the element to create (e.g. 'dc:title').
-->
<xsl:template name="allElements">
  <xsl:param name="sourceElement"/>
  <xsl:param name="targetElement"/>
  <xsl:for-each select="$sourceElement">
    <xsl:element name="{$targetElement}">
      <!-- normalize-space() without argument works on the context item -->
      <xsl:value-of select="normalize-space()"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  journal: writes a single oaf:journal element. The attributes issn, eissn, vol,
  iss, sp and ep are always present and hold the whitespace-normalised values of the
  corresponding parameters; the element content is the normalised journal title.
  Note that the 'issue' parameter is written to the attribute named 'iss'.
-->
<xsl:template name="journal">
  <xsl:param name="journalTitle"/>
  <xsl:param name="issn"/>
  <xsl:param name="eissn"/>
  <xsl:param name="vol"/>
  <xsl:param name="issue"/>
  <xsl:param name="sp"/>
  <xsl:param name="ep"/>
  <xsl:element name="oaf:journal">
    <xsl:attribute name="issn" select="normalize-space($issn)"/>
    <xsl:attribute name="eissn" select="normalize-space($eissn)"/>
    <xsl:attribute name="vol" select="normalize-space($vol)"/>
    <xsl:attribute name="iss" select="normalize-space($issue)"/>
    <xsl:attribute name="sp" select="normalize-space($sp)"/>
    <xsl:attribute name="ep" select="normalize-space($ep)"/>
    <xsl:value-of select="normalize-space($journalTitle)"/>
  </xsl:element>
</xsl:template>
<!--
  identifiers: writes oaf:identifier elements for the DOI, PMC id and PubMed id
  found in $sourceElement, in that order.
  An element is written only for ids that are present and non-empty, one per id,
  so a missing id type no longer yields an empty oaf:identifier and several ids of
  the same type are not merged into one value.
  param sourceElement: candidate id elements, distinguished by @pub-id-type
                       ('doi', 'pmcid', 'pmid').
-->
<xsl:template name="identifiers">
  <xsl:param name="sourceElement"/>
  <xsl:for-each select="$sourceElement[@pub-id-type='doi'][string-length(normalize-space(.)) > 0]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>doi</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="normalize-space(.)"/>
    </xsl:element>
  </xsl:for-each>
  <!-- pub-id-type 'pmcid' is published under identifierType 'pmc' -->
  <xsl:for-each select="$sourceElement[@pub-id-type='pmcid'][string-length(normalize-space(.)) > 0]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>pmc</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="normalize-space(.)"/>
    </xsl:element>
  </xsl:for-each>
  <xsl:for-each select="$sourceElement[@pub-id-type='pmid'][string-length(normalize-space(.)) > 0]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>pmid</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="normalize-space(.)"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  authors: emits one dc:creator per item of $sourceElement, formatted as
  "surname, given-names". The name is read from a direct name child or from
  name-alternatives/name. When the contributor has a contrib-id of type 'orcid',
  the ORCID attributes (nameIdentifierScheme, schemeURI, nameIdentifier) are added.
  param sourceElement: the contrib elements to convert.
-->
<xsl:template name="authors">
  <xsl:param name="sourceElement"/>
  <xsl:for-each select="$sourceElement">
    <!-- First ORCID contrib-id, trimmed; empty string when there is none. -->
    <xsl:variable name="orcid" select="normalize-space((./*[local-name()='contrib-id'][@contrib-id-type='orcid'])[1])"/>
    <!-- Name parts: the first surname / given-names in document order is taken
         explicitly, so contributors with several name elements do not depend on
         implicit first-item truncation. -->
    <xsl:variable name="surname" select="normalize-space((./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='surname'])[1])"/>
    <xsl:variable name="givenNames" select="normalize-space((./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='given-names'])[1])"/>
    <xsl:element name="dc:creator">
      <xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
        <xsl:attribute name="nameIdentifierScheme">
          <xsl:text>ORCID</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="schemeURI">
          <xsl:text>http://orcid.org/</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="nameIdentifier">
          <!-- Strip everything up to and including the first "orcid.org/". Unlike
               substring-after(., 'http://orcid.org/') this keeps the iD when it is given
               with an https or www URL, or as a bare iD without any URL prefix. -->
          <xsl:value-of select="replace($orcid, '^.*?orcid\.org/', '')"/>
        </xsl:attribute>
      </xsl:if>
      <xsl:value-of select="concat($surname, ', ', $givenNames)"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  Record header: shallow-copies the header element, re-applies templates to its
  attributes and children, then appends a dr:dateOfTransformation child holding
  the value of the stylesheet parameter $transDate.
-->
<xsl:template match="//*[local-name() = 'header']">
  <xsl:copy>
    <xsl:apply-templates select="@*|node()"/>
    <xsl:element name="dr:dateOfTransformation">
      <xsl:value-of select="$transDate"/>
    </xsl:element>
  </xsl:copy>
</xsl:template>
<!-- Identity rule: copies any attribute or node and recurses into its attributes and children. -->
<xsl:template match="@*|node()">
  <xsl:copy>
    <xsl:apply-templates select="@*|node()"/>
  </xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,493 @@
<!-- from PROD 2021-06-14 -->
<!-- The xlink prefix is bound here because the templates of this stylesheet read
     @xlink:href in their XPath expressions; an unbound prefix is a static error.
     It is listed in exclude-result-prefixes so that the binding is not copied to
     literal result elements. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xlink="http://www.w3.org/1999/xlink"
exclude-result-prefixes="xsl vocabulary dateCleaner xlink"
version="2.0">
<!-- Disabled hostedBy defaults (kept for reference): -->
<!--
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
-->
<!-- Parameters without default; presumably supplied by the caller (TODO confirm). -->
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<!-- Funder DOIs: an award-group is treated as FP7 when one of its institution-id
     values ends with varFP7FundRefDOI or varFP7OtherDOI, and as H2020 when it ends
     with varH2020FundRefDOI. -->
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<!-- Prefixes prepended to the award id when oaf:projectid is written. -->
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<!-- Text before the first ':' of header/recordIdentifier. -->
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<!-- Defaults to the time of the transformation. -->
<xsl:param name="transDate" select="current-dateTime()"/>
<!-- Electronic publication date parts, taken from pub-date[@pub-type='epub'] or
     pub-date[@date-type='pub' and @publication-format='electronic'] and formatted with
     the pictures '0000' / '00'.
     NOTE(review): oaf:dateAccepted and dc:date in the root template are built directly
     from pub-date[@publication-format='epub'] and do not read these variables; confirm
     whether they are still needed. -->
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<!-- terminate: aborts the whole transformation with a message. The root template
     calls it when the record has no non-empty article-title. -->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<!-- in journal.fi xml:lang of translated titles is not within the trans-title element but within the surrounding trans-title-group element (which just contains 1 trans-title element) -->
<!--
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
-->
<xsl:call-template name="title">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<!--
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />
<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
-->
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 +1, 2))" />
</xsl:element>
<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
<xsl:choose>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2] and ./*[local-name()='day' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4]">
<dc:date>
<xsl:value-of select="./*[local-name()='year']"/>
</dc:date>
</xsl:when>
</xsl:choose>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
<xsl:with-param name="targetElement" select="'oaf:license'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
<oaf:identifier>
<xsl:attribute name="identifierType">
<xsl:text>landingPage</xsl:text>
</xsl:attribute>
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
<oaf:fulltext>
<xsl:value-of select="."/>
</oaf:fulltext>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<!-- -->
<xsl:variable name='varRights' select="distinct-values((for $i in (
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
return vocabulary:clean( normalize-space($i), 'dnet:access_modes') "
/>
<!--
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
-->
<oaf:accessrights>
<xsl:choose>
<xsl:when test="$varRights[. = 'EMBARGO']">
<xsl:value-of select="'EMBARGO'"/>
</xsl:when>
<xsl:when test="$varRights[. != 'UNKNOWN']">
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$varRights[1]"/>
</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<!--
<oaf:accessrights>
<xsl:value-of select="$varRights[1]"/>
</oaf:accessrights>
<xsl:element name="oaf:accessrights">
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
</xsl:element>
-->
<!--
<xsl:element name="dr:CobjCategory">
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
<xsl:attribute name="type" select="$varSuperType"/>
<xsl:value-of select="$varCobjCategory" />
</xsl:element>
<xsl:variable name='varCobjCatLst' select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
-->
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
return vocabulary:clean( normalize-space($i), 'dnet:dnet:publication_resource')))" />
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
return concat($i, '###', vocabulary:clean( normalize-space($i), 'dnet:result_typologies'))" />
<dr:CobjCategory>
<xsl:choose>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other')]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other')][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="type" select="'other'"/>
<xsl:value-of select="'0000'" />
</xsl:otherwise>
</xsl:choose>
</dr:CobjCategory>
<!--
<xsl:for-each select="$varCobjSupLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<xsl:for-each select="$varTypLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
<!--
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<oaf:language>
<xsl:value-of select="vocabulary:clean( //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'dnet:languages')" />
</oaf:language>
<!-- review status -->
<!-- ToDo:
review status
~ ask Journal.fi to put it elsewhere
~ evaluate article-version (no example found yet)
subject/kwd:
~ handle thesauri (no example found yet)
relations:
~ handle fn (no example found yet)
-->
<!--
<xsl:variable name="varRefereedConvt" select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
-->
<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
<!--
<oaf:refereed>
<xsl:value-of select="$varRefereedDescp"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="$varRefereed"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
</oaf:refereed>
-->
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<!--
  allElements: for every item of $sourceElement, emits one element named
  $targetElement whose content is the whitespace-normalized string value of
  the item. For dc:title, dc:description and dc:subject the xml:lang of the
  item (or, failing that, of its parent) is carried over.
-->
<xsl:template name="allElements">
  <xsl:param name="sourceElement"/>
  <xsl:param name="targetElement"/>
  <xsl:for-each select="$sourceElement">
    <!-- own language tag wins over the parent's -->
    <xsl:variable name="lang" select="(./@xml:lang, ../@xml:lang)[1]"/>
    <xsl:element name="{$targetElement}">
      <xsl:if test="exists($lang) and $targetElement = ('dc:title', 'dc:description', 'dc:subject')">
        <xsl:attribute name="xml:lang" select="$lang"/>
      </xsl:if>
      <xsl:value-of select="normalize-space(.)"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  title: emits one dc:title per item of $sourceElement. The title text is the
  item followed by any following-sibling subtitle / trans-subtitle, each
  whitespace-normalized and joined with ': '. xml:lang is taken from the item
  or, failing that, from its parent.
-->
<xsl:template name="title">
  <xsl:param name="sourceElement"/>
  <xsl:for-each select="$sourceElement">
    <xsl:variable name="lang" select="(./@xml:lang, ../@xml:lang)[1]"/>
    <xsl:variable name="titleParts" select="(., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')])/normalize-space(.)"/>
    <xsl:element name="dc:title">
      <xsl:if test="exists($lang)">
        <xsl:attribute name="xml:lang" select="$lang"/>
      </xsl:if>
      <xsl:value-of select="string-join($titleParts, ': ')"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  journal: emits a single oaf:journal element. The journal title becomes the
  element content; print ISSN, electronic ISSN, volume, issue, start page and
  end page become the attributes issn, eissn, vol, iss, sp and ep.
  Every value is whitespace-normalized; absent values yield empty attributes.
-->
<xsl:template name="journal">
  <xsl:param name="journalTitle"/>
  <xsl:param name="issn"/>
  <xsl:param name="eissn"/>
  <xsl:param name="vol"/>
  <xsl:param name="issue"/>
  <xsl:param name="sp"/>
  <xsl:param name="ep"/>
  <xsl:element name="oaf:journal">
    <xsl:attribute name="issn" select="normalize-space($issn)"/>
    <xsl:attribute name="eissn" select="normalize-space($eissn)"/>
    <xsl:attribute name="vol" select="normalize-space($vol)"/>
    <xsl:attribute name="iss" select="normalize-space($issue)"/>
    <xsl:attribute name="sp" select="normalize-space($sp)"/>
    <xsl:attribute name="ep" select="normalize-space($ep)"/>
    <xsl:value-of select="normalize-space($journalTitle)"/>
  </xsl:element>
</xsl:template>
<!--
  identifiers: emits an oaf:identifier with identifierType="doi" when
  $sourceElement contains a non-empty item whose @pub-id-type is 'doi'.
  Nothing is emitted otherwise.
-->
<xsl:template name="identifiers">
  <xsl:param name="sourceElement"/>
  <xsl:variable name="doi" select="$sourceElement[@pub-id-type='doi']"/>
  <xsl:if test="string-length($doi) gt 0">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType" select="'doi'"/>
      <xsl:value-of select="$doi"/>
    </xsl:element>
  </xsl:if>
</xsl:template>
<!--
  authors: emits one dc:creator per item of $sourceElement, formatted as
  "surname, given-names" (both whitespace-normalized).
  When the contributor carries a contrib-id of type 'orcid', the attributes
  nameIdentifierScheme, schemeURI and nameIdentifier are added.
  The ORCID iD is extracted by stripping an optional http(s)://[www.]orcid.org/
  prefix, so https URLs and bare iDs are handled as well as the legacy http
  form; only the first ORCID contrib-id of a contributor is used.
-->
<xsl:template name="authors">
  <xsl:param name="sourceElement"/>
  <xsl:for-each select="$sourceElement">
    <!-- first ORCID identifier of this contributor, if any -->
    <xsl:variable name="orcid" select="(./*[local-name()='contrib-id'][@contrib-id-type='orcid'])[1]"/>
    <xsl:element name="dc:creator">
      <xsl:if test="exists($orcid)">
        <xsl:attribute name="nameIdentifierScheme">
          <xsl:text>ORCID</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="schemeURI">
          <xsl:text>http://orcid.org/</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="nameIdentifier">
          <!-- substring-after(., 'http://orcid.org/') returned '' for https URLs and bare iDs -->
          <xsl:value-of select="replace(normalize-space($orcid), '^https?://(www\.)?orcid\.org/', '', 'i')"/>
        </xsl:attribute>
      </xsl:if>
      <xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  Header template: copies every element whose local name is 'header',
  processes its children and attributes through the other templates, and
  appends a dr:dateOfTransformation child holding the value of $transDate
  (declared outside this template).
-->
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<!-- appended after the copied header content -->
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<!--
  Identity template: copies any node or attribute and recurses into its
  children and attributes.
-->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,373 @@
<!-- for adaptation , 2021-06-14 PROD -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
exclude-result-prefixes="xsl vocabulary dateCleaner"
version="2.0">
<!-- Datasource the record is collected from: $varOfficialName and $varDataSourceId are written to oaf:collectedFrom.
     $varDsType is declared but not referenced in this stylesheet. -->
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:output indent="yes" omit-xml-declaration="yes" />
<!-- Values written to oaf:hostedBy (id and name attributes). -->
<xsl:param name="varHostedById" select="&apos;opendoar____::908&apos;" />
<xsl:param name="varHostedByName" select="&apos;Europe PubMed Central&apos;" />
<!-- Funder DOIs matched (with ends-with) against institution-id inside award-group. -->
<xsl:param name="varFP7FundRefDOI" select="&apos;10.13039/501100004963&apos;" />
<xsl:param name="varH2020FundRefDOI" select="&apos;10.13039/501100007601&apos;" />
<!-- Prefixes prepended to a six-digit award-id to build oaf:projectid. -->
<xsl:param name="varFP7" select="&apos;corda_______::&apos;" />
<xsl:param name="varH2020" select="&apos;corda__h2020::&apos;" />
<!-- Prefix concatenated with the pmcid article-id to build dc:identifier and oaf:fulltext. -->
<xsl:param name="epmcUrlPrefix" select="&apos;http://europepmc.org/articles/&apos;" />
<!-- Part of header/recordIdentifier before the first ':'; not referenced in this stylesheet. -->
<xsl:param name="repoCode" select="substring-before(//*[local-name() = &apos;header&apos;]/*[local-name()=&apos;recordIdentifier&apos;], &apos;:&apos;)" />
<!-- Not referenced in this stylesheet. -->
<xsl:param name="index" select="0" />
<!-- Timestamp written to dr:dateOfTransformation in the header. -->
<xsl:param name="transDate" select="current-dateTime()" />
<!-- Year, month and day of the electronic publication date (pub-date with pub-type 'epub', or with date-type 'pub'
     and publication-format 'electronic'), zero-padded by format-number. $year and $month feed oaf:dateAccepted;
     $day is not referenced in this stylesheet.
     NOTE(review): if both pub-date variants are present the union holds two nodes; format-number presumably
     rejects that under XSLT 2.0 cardinality rules (to be confirmed). -->
<xsl:variable name="year" select="format-number( ( //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;year&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;year&apos;]), &apos;0000&apos;)" />
<xsl:variable name="month" select="format-number( (//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;month&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;month&apos;]), &apos;00&apos;)" />
<xsl:variable name="day" select="format-number( (//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;day&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;day&apos;]), &apos;00&apos;)" />
<!--
  terminate: aborts the transformation with a message. Called from the root
  template when the record has no non-empty article-title.
-->
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = &apos;header&apos;]" />
<metadata>
<xsl:if test="not(//*[local-name() = &apos;article-meta&apos;]//*[local-name()=&apos;article-title&apos;][string-length(normalize-space(.))&gt; 0])">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = &apos;article-meta&apos;]//*[local-name()=&apos;article-title&apos;][string-length(normalize-space(.))&gt; 0]" />
<xsl:with-param name="targetElement" select="&apos;dc:title&apos;" />
</xsl:call-template>
<!--
  Authors: contributors of type 'author' directly under article-meta/contrib-group
  that are not collaborations, carry a name (directly or inside name-alternatives)
  and have at least one non-empty surname or given-names.
  The last predicate uses exists(...) instead of summing string-length() calls:
  string-length() accepts at most one item, so a contributor with several
  surname / given-names descendants (e.g. inside name-alternatives) would
  otherwise abort the transformation with a type error.
-->
<xsl:call-template name="authors">
  <!--
  <xsl:with-param name="sourceElement" select="//*[local-name() = 'contrib'][@contrib-type='author']"/>
  -->
  <xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))][./*[local-name()='name'] or ./*[local-name()='name-alternatives']/*[local-name()='name']][exists(.//*[local-name() = ('surname', 'given-names')][string-length(.) gt 0])]" />
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;abstract&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:description&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-categories&apos;]//*[local-name()=&apos;subject&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:subject&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;kwd-group&apos; and not(lower-case(@kwd-group-type)=(&apos;mesh&apos;, &apos;ocis&apos;))]//*[local-name()=&apos;kwd&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:subject&apos;" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()=&apos;kwd-group&apos; and lower-case(@kwd-group-type)=&apos;mesh&apos; and ./*[local-name()=&apos;kwd&apos;]]">
<xsl:for-each select="./*[local-name()=&apos;kwd&apos;]">
<dc:subject>
<xsl:attribute name="subjectScheme" select="&apos;mesh&apos;" />
<xsl:attribute name="schemeURI" select="&apos;http://www.nlm.nih.gov/mesh/&apos;" />
<xsl:attribute name="valueURI" select="&apos;&apos;" />
<xsl:value-of select="./concat(&apos;mesh:&apos;, replace(., &apos;mesh (.*)$&apos;, &apos;$1&apos;))" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:for-each select="//*[local-name()=&apos;kwd-group&apos; and lower-case(@kwd-group-type)=&apos;ocis&apos; and ./*[local-name()=&apos;kwd&apos;]]">
<xsl:for-each select="./*[local-name()=&apos;kwd&apos;]">
<dc:subject>
<xsl:attribute name="subjectScheme" select="&apos;ocis&apos;" />
<xsl:attribute name="schemeURI" select="&apos;&apos;" />
<xsl:attribute name="valueURI" select="&apos;&apos;" />
<xsl:value-of select="./concat(&apos;ocis:&apos;, .)" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;publisher&apos;]/*[local-name()=&apos;publisher-name&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:publisher&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;journal-meta&apos;]//*[local-name()=&apos;journal-title&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:source&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = &apos;article-meta&apos;]/(*[local-name() = &apos;article-version-alternatives&apos;]/*[local-name() = &apos;article-version&apos;], *[local-name() = &apos;article-version&apos;])/concat(&apos;article-version (&apos;, @article-version-type, &apos;) &apos;, .)" />
<xsl:with-param name="targetElement" select="&apos;dc:source&apos;" />
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:text>eng</xsl:text>
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()=&apos;article-id&apos;][@pub-id-type=&apos;pmcid&apos;])" />
</xsl:element>
<xsl:element name="oaf:fulltext">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()=&apos;article-id&apos;][@pub-id-type=&apos;pmcid&apos;])" />
</xsl:element>
<!--
  oaf:dateAccepted: built from the electronic publication date when one exists
  (year plus month, day fixed to 01; month falls back to 01 when it is not a
  number), otherwise from the year of the print publication date.
-->
<xsl:element name="oaf:dateAccepted">
  <xsl:choose>
    <xsl:when test="//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub' or (@date-type='pub' and @publication-format='electronic')]">
      <xsl:choose>
        <!-- no usable month in the electronic date -->
        <xsl:when test="string(number($month)) eq 'NaN'">
          <xsl:value-of select="concat($year, '-01-01')" />
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="concat($year, '-', $month, '-01')" />
        </xsl:otherwise>
      </xsl:choose>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="concat(//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='ppub']/*[local-name()='year'], '-01-01')" />
    </xsl:otherwise>
  </xsl:choose>
</xsl:element>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()=&apos;permissions&apos;]/*[local-name()=&apos;copyright-statement&apos;])" />
<xsl:with-param name="targetElement" select="&apos;dc:rights&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()=&apos;permissions&apos;]/*[local-name()=&apos;license&apos;])" />
<xsl:with-param name="targetElement" select="&apos;dc:rights&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;fn-group&apos;]//*[local-name()=&apos;fn&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:relation&apos;" />
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-id&apos;]" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()=&apos;award-group&apos;][.//*[local-name()=&apos;institution-id&apos;][ends-with(., $varFP7FundRefDOI)]]">
<xsl:if test="./*[local-name()=&apos;award-id&apos;][matches(normalize-space(.), &apos;(^\d\d\d\d\d\d$)&apos;, &apos;i&apos;)]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()=&apos;award-id&apos;])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()=&apos;award-group&apos;][.//*[local-name()=&apos;institution-id&apos;][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()=&apos;award-id&apos;][matches(normalize-space(.), &apos;(^\d\d\d\d\d\d$)&apos;, &apos;i&apos;)]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()=&apos;award-id&apos;])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:element name="oaf:accessrights">
<xsl:text>OPEN</xsl:text>
</xsl:element>
<xsl:element name="dr:CobjCategory">
<xsl:attribute name="type" select="&apos;publication&apos;" />
<xsl:text>0001</xsl:text>
</xsl:element>
<dc:type>
<xsl:value-of select="//*[local-name() = &apos;article&apos;]/@article-type" />
</dc:type>
<!--
  varRefereedConvt: review-level codes obtained by cleaning candidate values
  against the 'dnet:review_levels' vocabulary. The xsl:choose that follows emits
  oaf:refereed 0001 / 0002 when this sequence contains the matching code.
  NOTE(review): the selector reads 'resource'/'resourceType'/'version' elements,
  whereas every other selector in this template reads JATS elements (article,
  article-meta, ...) and the retired expression kept below read @article-type
  and setSpec. If the input has no 'resource' element this sequence is empty and
  the two branches testing it never fire. Confirm the selector is intended.
-->
<xsl:variable name="varRefereedConvt" select="for $i in (//*[local-name() = 'resource']/*[local-name() = ('resourceType', 'version')]/(., @uri))
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
<!-- <xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = &apos;article&apos;]/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), &apos;ReviewLevels&apos;)" />
-->
<xsl:choose>
<xsl:when test="count($varRefereedConvt[. = &apos;0001&apos;]) &gt; 0">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = &apos;article-id&apos;][@pub-id-type=&apos;doi&apos;][matches(., &apos;^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = (&apos;abstract&apos;, &apos;trans-abstract&apos;)][matches(lower-case(.), &apos;^\s*(.p.\s*)?refereed\s*article.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article&apos;]/*[local-name() = (&apos;back&apos;, &apos;front&apos;)]/*[local-name() = (&apos;fn-group&apos;, &apos;notes&apos;)][
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*&apos;) or
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]*review\s*information.*&apos;) or
matches(lower-case(.), &apos;.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*&apos;) or
matches(lower-case(.), &apos;.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*&apos;) or
matches(lower-case(.), &apos;.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*&apos;) or
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]*reviewed\s*by.*&apos;) or
matches(lower-case(.), &apos;.*refereed\s*anonymously.*&apos;) or
matches(lower-case(.), &apos;.*peer\s*reviewer\s*reports\s*are\s*available.*&apos;) or
matches(lower-case(.), &apos;.*\[.*peer[\s\-\._]*review\s*:.*\].*&apos;) or
matches(lower-case(.), &apos;.*\[.*referees\s*:.*\].*&apos;) or
matches(lower-case(.), &apos;^\s*plagiarism[\s\-\._]check.*&apos;) or
matches(lower-case(.), &apos;^\s*peer[\s\-\._]*review.*&apos;) or
matches(lower-case(.), &apos;^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*&apos;) or
matches(lower-case(.), &apos;^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = (&apos;article-meta&apos;, &apos;app&apos;, &apos;app-group&apos;)]/*[local-name() = &apos;supplementary-material&apos;]/*[local-name() = &apos;media&apos;][
matches(lower-case(.), &apos;.*peer\s*review\s*file.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = &apos;contrib-group&apos;]
[./@role/lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;) or
./*[local-name() = &apos;contrib&apos;][./@role/lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;) or ./*[local-name() = &apos;role&apos; and lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;)] or ./@contrib-type/lower-case(.) = &apos;reviewer&apos;]]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereedConvt[. = &apos;0002&apos;]) &gt; 0">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = (&apos;related-article&apos;)][./@related-article-type = (&apos;peer-reviewed-article&apos;, &apos;reviewed-article&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;][./*[local-name() = &apos;article-version-alternatives&apos;]/*[local-name() = &apos;article-version&apos; and . = &apos;preprint&apos;] or ./*[local-name() = &apos;article-version&apos; and . = &apos;preprint&apos;]]">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()=&apos;journal-meta&apos;]//*[local-name()=&apos;journal-title&apos;]" />
<xsl:with-param name="issn" select="//*[local-name()=&apos;journal-meta&apos;]/*[local-name()=&apos;issn&apos;][@pub-type=&apos;ppub&apos;]" />
<xsl:with-param name="eissn" select="//*[local-name()=&apos;journal-meta&apos;]/*[local-name()=&apos;issn&apos;][@pub-type=&apos;epub&apos;]" />
<xsl:with-param name="vol" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;volume&apos;]" />
<xsl:with-param name="issue" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;issue&apos;]" />
<xsl:with-param name="sp" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;fpage&apos;]" />
<xsl:with-param name="ep" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;lpage&apos;]" />
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varHostedByName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varHostedById" />
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId" />
</xsl:attribute>
</oaf:collectedFrom>
<xsl:for-each select="//*[local-name() = &apos;article&apos;]/*[local-name() = (&apos;back&apos;, &apos;front&apos;)]/*[local-name() = &apos;fn-group&apos;]/*[local-name() = &apos;fn&apos;][matches(lower-case(.), &apos;country(/territory)? of origin:?\s*[A-Za-z\-]+&apos;)]">
<oaf:country>
<!--
<xsl:value-of select="TransformationFunction:convertString($tf, replace(lower-case(.), '^(.|\s)*country(/territory)? of origin:?\s+([A-Za-z\-,\(\)]+(\s+[A-Za-z\-,\(\)]+)*)(.|\s)*$', '$3'), 'Countries')"/>
-->
<!-- ACz, 2021-06-14
<xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), &apos;of origin&apos;), 2)), &apos;Countries&apos;)" />
-->
<xsl:value-of select="vocabulary:clean( normalize-space(substring(substring-after(lower-case(.), &apos;of origin&apos;), 2)), 'dnet:countries')"/>
</oaf:country>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = &apos;about&apos;]" />
</record>
</xsl:template>
<!--
  allElements: for every item of $sourceElement (node or atomic value), emits
  one element named $targetElement containing the whitespace-normalized string
  value of the item.
-->
<xsl:template name="allElements">
  <xsl:param name="sourceElement" />
  <xsl:param name="targetElement" />
  <xsl:for-each select="$sourceElement">
    <xsl:variable name="cleanText" select="normalize-space(.)" />
    <xsl:element name="{$targetElement}">
      <xsl:value-of select="$cleanText" />
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  journal: emits a single oaf:journal element whose content is the journal
  title and whose attributes issn, eissn, vol, iss, sp and ep hold the print
  ISSN, electronic ISSN, volume, issue, first page and last page.
  All values are whitespace-normalized; missing values give empty attributes.
-->
<xsl:template name="journal">
  <xsl:param name="journalTitle" />
  <xsl:param name="issn" />
  <xsl:param name="eissn" />
  <xsl:param name="vol" />
  <xsl:param name="issue" />
  <xsl:param name="sp" />
  <xsl:param name="ep" />
  <xsl:element name="oaf:journal">
    <xsl:attribute name="issn" select="normalize-space($issn)" />
    <xsl:attribute name="eissn" select="normalize-space($eissn)" />
    <xsl:attribute name="vol" select="normalize-space($vol)" />
    <xsl:attribute name="iss" select="normalize-space($issue)" />
    <xsl:attribute name="sp" select="normalize-space($sp)" />
    <xsl:attribute name="ep" select="normalize-space($ep)" />
    <xsl:value-of select="normalize-space($journalTitle)" />
  </xsl:element>
</xsl:template>
<!--
  identifiers: emits oaf:identifier elements for the DOI, PMC id and PubMed id
  found in $sourceElement (items selected by @pub-id-type 'doi', 'pmcid' and
  'pmid'; the emitted identifierType values are 'doi', 'pmc' and 'pmid').
  An identifier is emitted only when a non-blank value exists, and only the
  first such value per type is used. Previously an empty oaf:identifier was
  written for every missing type and multiple values of one type were joined
  with spaces into a single, invalid identifier.
-->
<xsl:template name="identifiers">
  <xsl:param name="sourceElement" />
  <xsl:for-each select="($sourceElement[@pub-id-type='doi'][string-length(normalize-space(.)) gt 0])[1]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>doi</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="." />
    </xsl:element>
  </xsl:for-each>
  <xsl:for-each select="($sourceElement[@pub-id-type='pmcid'][string-length(normalize-space(.)) gt 0])[1]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>pmc</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="." />
    </xsl:element>
  </xsl:for-each>
  <xsl:for-each select="($sourceElement[@pub-id-type='pmid'][string-length(normalize-space(.)) gt 0])[1]">
    <xsl:element name="oaf:identifier">
      <xsl:attribute name="identifierType">
        <xsl:text>pmid</xsl:text>
      </xsl:attribute>
      <xsl:value-of select="." />
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<!--
  authors: emits one dc:creator per item of $sourceElement, formatted as
  "surname, given-names" (both whitespace-normalized).
  The name is the contributor's own name element or, failing that, the first
  name inside name-alternatives. Exactly one name is selected because
  normalize-space() accepts at most one item: feeding it the surnames of all
  alternative names raised a type error.
  When a contrib-id of type 'orcid' is present, nameIdentifierScheme, schemeURI
  and nameIdentifier are added; the iD is obtained by stripping an optional
  http(s)://[www.]orcid.org/ prefix, so https URLs and bare iDs no longer
  produce an empty nameIdentifier.
-->
<xsl:template name="authors">
  <xsl:param name="sourceElement" />
  <xsl:for-each select="$sourceElement">
    <!-- first ORCID identifier of this contributor, if any -->
    <xsl:variable name="orcid" select="(./*[local-name()='contrib-id'][@contrib-id-type='orcid'])[1]" />
    <!-- direct name first, then the first alternative -->
    <xsl:variable name="personName" select="(./*[local-name()='name'], ./*[local-name()='name-alternatives']/*[local-name()='name'])[1]" />
    <xsl:element name="dc:creator">
      <xsl:if test="exists($orcid)">
        <xsl:attribute name="nameIdentifierScheme">
          <xsl:text>ORCID</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="schemeURI">
          <xsl:text>http://orcid.org/</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="nameIdentifier">
          <xsl:value-of select="replace(normalize-space($orcid), '^https?://(www\.)?orcid\.org/', '', 'i')" />
        </xsl:attribute>
      </xsl:if>
      <xsl:value-of select="concat(normalize-space(($personName/*[local-name()='surname'])[1]), ', ', normalize-space(($personName/*[local-name()='given-names'])[1]))" />
    </xsl:element>
  </xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = &apos;header&apos;]">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate" />
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
</xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -80,6 +80,7 @@ public class GenerateStatsJob {
.map(
(MapFunction<Tuple2<String, DatasourceStats>, DatasourceStats>) t -> t._2,
Encoders.bean(DatasourceStats.class))
.coalesce(1)
.write()
.mode(SaveMode.Overwrite)
.jdbc(dbUrl, "oa_datasource_stats_temp", connectionProperties);

View File

@ -38,6 +38,9 @@ object DoiBoostMappingUtil {
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
// Matches the DOI prefix "10." either at the very start of the string or right after a slash.
// Both dots are escaped so that only a literal "10." is recognised (not "/105", "/10x", ...).
val DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"
val DOI_PREFIX = "10."
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
def toActionSet(item:Oaf) :(String, String) = {
@ -352,5 +355,28 @@ object DoiBoostMappingUtil {
}
def isEmpty(x: String) = x == null || x.trim.isEmpty
/**
 * Normalizes a DOI: removes every whitespace character, lower-cases the value and
 * drops whatever precedes the "10." prefix (for instance a resolver URL).
 *
 * Lower-casing uses Locale.ROOT so the result does not depend on the JVM default locale.
 *
 * @param input the raw DOI, possibly null
 * @return the normalized DOI starting with "10.", or null when input is null or
 *         does not contain the "10." prefix
 */
def normalizeDoi(input : String) :String ={
  if(input == null)
    return null
  val replaced = input
    .replaceAll("\\s", "")
    .toLowerCase(java.util.Locale.ROOT)
    .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
  // An empty string, or one without the prefix, yields a negative index and therefore null.
  val prefixStart = replaced.indexOf(DOI_PREFIX)
  if (prefixStart < 0)
    return null
  replaced.substring(prefixStart)
}
}

View File

@ -17,6 +17,8 @@ import scala.collection.mutable
import scala.util.matching.Regex
import java.util
import eu.dnetlib.doiboost.DoiBoostMappingUtil
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
case class mappingAffiliation(name: String) {}
@ -87,7 +89,7 @@ case object Crossref2Oaf {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String]
val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId
@ -99,6 +101,7 @@ case object Crossref2Oaf {
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
result.setOriginalId(originalIds)
// Add DataInfo
result.setDataInfo(generateDataInfo())
@ -140,7 +143,6 @@ case object Crossref2Oaf {
result.setDateofacceptance(asField(issuedDate))
}
else {
// TODO: take the oldest date between publishedPrint and publishedOnline
result.setDateofacceptance(asField(createdDate.getValue))
}
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
@ -408,14 +410,6 @@ case object Crossref2Oaf {
}
def extractDump(input:String):List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val a = (json \ "items").extract[JArray]
a.arr.map(s => compact(render(s)))
}
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct

View File

@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
@ -21,7 +22,7 @@ object CrossrefDataset {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi:String = (json \ "DOI").extract[String]
val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
CrossrefDT(doi, input, ts)
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item
import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass
import org.apache.hadoop.io.{IntWritable, Text}
@ -27,7 +28,7 @@ object GenerateCrossrefDataset {
def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta)
val doi:String = (json \ "DOI").extract[String]
val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long]
CrossrefDT(doi, meta, timestamp)

View File

@ -196,8 +196,8 @@ case object ConversionUtil {
val authors = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)
//IMPORTANT
//The old method result.setId(generateIdentifier(result, doi))
@ -258,11 +258,14 @@ case object ConversionUtil {
val description = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)
//Set identifier as 50 | doiboost____::md5(DOI)
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
//IMPORTANT
//The old method result.setId(generateIdentifier(result, doi))
//will be replaced using IdentifierFactory
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)

View File

@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
@ -12,6 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkProcessMAG {
/**
 * Keeps a single MAG paper per normalized DOI.
 *
 * Papers without a DOI, or whose DOI cannot be normalized, are discarded. Papers sharing
 * the same normalized DOI are reduced to one with ConversionUtil.choiceLatestMagArtitcle,
 * and the surviving record is rebuilt with the normalized DOI in its Doi field.
 *
 * @param d the MAG papers to deduplicate
 * @return one MagPapers per normalized DOI, carrying the normalized value
 */
def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={
  d.where(col("Doi").isNotNull)
    // normalizeDoi returns null when the value holds no "10." prefix: without this filter
    // all such papers would be grouped under a null key and one of them emitted with a null Doi.
    .filter((mp: MagPapers) => DoiBoostMappingUtil.normalizeDoi(mp.Doi) != null)
    .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
    .reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
    .map(_._2)(Encoders.product[MagPapers])
    .map(mp => {
      // Same record, with the Doi replaced by its normalized form.
      new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
        mp.DocType, mp.PaperTitle, mp.OriginalTitle,
        mp.BookTitle, mp.Year, mp.Date, mp.Publisher,
        mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
        mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
        mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
        mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
    })(Encoders.product[MagPapers])
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
@ -33,17 +51,11 @@ object SparkProcessMAG {
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninque DOI in Papers:")
logger.info("Phase 1) make uninue DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
.rdd
.map{ p: MagPapers => Tuple2(p.Doi, p) }
.reduceByKey((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
.map(_._2)
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
val distinctPaper: Dataset[MagPapers] = getDistinctResults(d)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

View File

@ -84,7 +84,7 @@ object ORCIDToOAF {
JField("type", JString(typeValue)) <- extIds
JField("value", JString(value)) <- extIds
if "doi".equalsIgnoreCase(typeValue)
} yield (typeValue, value)
} yield (typeValue, DoiBoostMappingUtil.normalizeDoi(value))
if (doi.nonEmpty) {
return doi.map(l =>OrcidWork(oid, l._2))
}
@ -102,7 +102,7 @@ object ORCIDToOAF {
def convertTOOAF(input:ORCIDItem) :Publication = {
val doi = input.doi
val pub:Publication = new Publication
pub.setPid(List(createSP(doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))

View File

@ -3,6 +3,7 @@ package eu.dnetlib.doiboost.uw
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@ -53,7 +54,10 @@ object UnpayWallToOAF {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val doi = (json \"doi").extract[String]
val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String])
if(doi == null)
return null
val is_oa = (json\ "is_oa").extract[Boolean]

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.junit.jupiter.api.Test
/** Unit tests for DoiBoostMappingUtil.normalizeDoi. */
class NormalizeDOITest {

  /** An upper-case DOI comes back lower-cased. */
  @Test
  def doiDSLowerCase():Unit = {
    val rawDoi = "10.1042/BCJ20160876"
    val normalized = DoiBoostMappingUtil.normalizeDoi(rawDoi)
    assert(normalized.equals(rawDoi.toLowerCase()))
  }

  /** A value lacking the "10." prefix is rejected. */
  @Test
  def doiFiltered():Unit = {
    val normalized = DoiBoostMappingUtil.normalizeDoi("0.1042/BCJ20160876")
    assert(normalized == null)
  }

  /** A resolver URL whose path lacks the "10." prefix is rejected. */
  @Test
  def doiFiltered2():Unit = {
    val normalized = DoiBoostMappingUtil.normalizeDoi("https://doi.org/0.1042/BCJ20160876")
    assert(normalized == null)
  }

  /** The resolver URL in front of the DOI is removed. */
  @Test
  def doiCleaned():Unit = {
    val expected = "10.1042/BCJ20160876".toLowerCase()
    val normalized = DoiBoostMappingUtil.normalizeDoi("https://doi.org/10.1042/BCJ20160876")
    assert(normalized.equals(expected))
  }

  /** Whitespace inside the DOI is removed together with the resolver URL. */
  @Test
  def doiCleaned1():Unit = {
    val expected = "10.1042/BCJ20160876".toLowerCase()
    val normalized = DoiBoostMappingUtil.normalizeDoi("https://doi.org/10.1042/ BCJ20160876")
    assert(normalized.equals(expected))
  }
}

View File

@ -461,5 +461,37 @@ class CrossrefMappingTest {
// })
}
/** Converts the funder template fixture and checks that the only pid is the lower-cased DOI. */
@Test
def testNormalizeDOI(): Unit = {
  val funderLine :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
  val json = Source
    .fromInputStream(getClass.getResourceAsStream("article_funder_template.json"))
    .mkString
    .replace("%s", funderLine)

  val converted: List[Oaf] = Crossref2Oaf.convert(json)
  assertTrue(converted.nonEmpty)

  val publications = converted.filter(_.isInstanceOf[Publication])
  val result: Result = publications.head.asInstanceOf[Publication]

  val expectedDoi = "10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()
  for (pid <- result.getPid.asScala) {
    assertTrue(pid.getQualifier.getClassid.equals("doi"))
  }
  assertTrue(result.getPid.size() == 1)
  for (pid <- result.getPid.asScala) {
    assertTrue(pid.getValue.equals(expectedDoi))
  }
}
/** Converts the article fixture and checks that the only pid is the lower-cased DOI. */
@Test
def testNormalizeDOI2(): Unit = {
  val json = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString

  val converted: List[Oaf] = Crossref2Oaf.convert(json)
  assertTrue(converted.nonEmpty)

  val publications = converted.filter(_.isInstanceOf[Publication])
  val result: Result = publications.head.asInstanceOf[Publication]

  val expectedDoi = "10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()
  for (pid <- result.getPid.asScala) {
    assertTrue(pid.getQualifier.getClassid.equals("doi"))
  }
  assertTrue(result.getPid.size() == 1)
  for (pid <- result.getPid.asScala) {
    assertTrue(pid.getValue.equals(expectedDoi))
  }
}
}

View File

@ -4,7 +4,7 @@ import java.sql.Timestamp
import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
import org.apache.spark.SparkConf
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
@ -62,6 +62,55 @@ class MAGMappingTest {
logger.debug(description)
}
/**
 * Runs SparkProcessMAG.getDistinctResults on the magPapers.json fixture and checks that
 * 10 papers are returned, each with a lower-case Doi.
 */
@Test
def normalizeDoiTest():Unit = {
  val conf = new SparkConf().setAppName("test").setMaster("local[2]")
  val sc = new SparkContext(conf)
  val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
  // Close the session even when an assertion fails, so that a failure here does not
  // leave a running context behind for the tests that follow.
  try {
    val path = getClass.getResource("magPapers.json").getPath

    import org.apache.spark.sql.Encoders
    val schema = Encoders.product[MagPapers].schema

    import spark.implicits._
    val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
    val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
    assertTrue(ret.count == 10)
    ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
  } finally {
    spark.close()
  }
}
/**
 * Runs SparkProcessMAG.getDistinctResults on the duplicatedMagPapers.json fixture and checks
 * that 8 papers are returned, each with a lower-case Doi.
 */
@Test
def normalizeDoiTest2():Unit = {
  // json4s pieces are only needed by the commented-out debug line at the bottom.
  import org.json4s.jackson.Serialization.write
  import org.json4s.DefaultFormats
  implicit val formats = DefaultFormats

  val conf = new SparkConf().setAppName("test").setMaster("local[2]")
  val sc = new SparkContext(conf)
  val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
  // Close the session even when an assertion fails, so that a failure here does not
  // leave a running context behind for the tests that follow.
  try {
    val path = getClass.getResource("duplicatedMagPapers.json").getPath

    import org.apache.spark.sql.Encoders
    val schema = Encoders.product[MagPapers].schema

    import spark.implicits._
    val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
    val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
    assertTrue(ret.count == 8)
    ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
    //ret.take(8).foreach(mp => println(write(mp)))
  } finally {
    spark.close()
  }
}
}

View File

@ -12,6 +12,8 @@ import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.Path
import scala.io.Source
import scala.collection.JavaConversions._
class MappingORCIDToOAFTest {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper()
@ -63,9 +65,26 @@ class MappingORCIDToOAFTest {
}
/**
 * Converts an ORCIDItem with an upper-case DOI and checks that the publication carries
 * exactly one pid, of class "doi", whose value is the lower-cased DOI.
 */
@Test
def testExtractDat1():Unit ={
  // NOTE(review): the expected value is lower-case while the input DOI is not; confirm
  // where in the conversion the pid value gets lower-cased.
  val authors: List[OrcidAuthor] = List(
    OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
    OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ),
    OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
  val item:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", authors)

  val publication = ORCIDToOAF.convertTOOAF(item)
  assert(publication.getPid.size() == 1)
  for (pid <- publication.getPid.toList) {
    assert(pid.getQualifier.getClassid.equals("doi"))
  }
  for (pid <- publication.getPid.toList) {
    assert(pid.getValue.equals("10.1042/BCJ20160876".toLowerCase()))
  }
  //println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
}
}

View File

@ -20,16 +20,26 @@ class UnpayWallMappingTest {
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("input.json")).mkString
var i:Int = 0
for (line <-Ilist.lines) {
val p = UnpayWallToOAF.convertToOAF(line)
if(p!= null) {
assertTrue(p.getInstance().size()==1)
if (i== 0){
assertTrue(p.getPid.get(0).getValue.equals("10.1038/2211089b0"))
}
if (i== 1){
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00058.s001"))
}
if (i== 2){
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00086.s001"))
}
logger.info(s"ID : ${p.getId}")
}
assertNotNull(line)
assertTrue(line.nonEmpty)
i = i+1
}
@ -39,7 +49,9 @@ class UnpayWallMappingTest {
val item = UnpayWallToOAF.convertToOAF(l)
assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze)
logger.info(mapper.writeValueAsString(item))
}
}

View File

@ -1,5 +1,5 @@
{
"DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46",
"DOI": " 10.26850/1678-4618eqj.v35.1.2010.p41-46",
"issued": {
"date-parts": [
[

View File

@ -1,5 +1,5 @@
{
"DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46",
"DOI": "10.26850/1678-4618EQJ.v35.1.2010.p41-46",
"issued": {
"date-parts": [
[

View File

@ -0,0 +1,10 @@
[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2883520096,"Rank":3,"Doi":"10.5465/AMBPP .2018.12619SYMPOSIUM","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology Letters","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887446149,"Rank":4,"Doi":"10.5465/ambpp.2018.12619symposium","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}]

View File

@ -0,0 +1,10 @@
[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2883520096,"Rank":3,"Doi":"10.4995/RLYLA.2018.9176","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology Letters","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887446149,"Rank":4,"Doi":"10.15585/MMWR.MM6730A7","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}]

View File

@ -1,6 +1,6 @@
{"doi": "10.1038/2211089b0", "year": 1969, "genre": "journal-article", "is_oa": true, "title": "Planning: Trees in Danger", "doi_url": "https://doi.org/10.1038/2211089b0", "updated": "2020-02-06T13:51:15.164623", "oa_status": "bronze", "publisher": "Springer Nature", "z_authors": [{"name": "Our Planning Correspondent"}], "is_paratext": false, "journal_name": "Nature", "oa_locations": [{"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0028-0836,1476-4687", "journal_issn_l": "0028-0836", "published_date": "1969-03-01", "best_oa_location": {"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acs.bioconjchem.8b00058.s001", "year": null, "genre": "component", "is_oa": true, "title": "Engineering Reversible CellCell Interactions with Lipid Anchored Prosthetic Receptors", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "updated": "2020-04-04T21:15:41.966773", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acs.bioconjchem.8b00086.s001", "year": null, "genre": "component", "is_oa": true, "title": "Rapid, Stoichiometric, Site-Specific Modification of Aldehyde-Containing Proteins Using a Tandem Knoevenagel-Intra Michael Addition Reaction", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "updated": "2020-04-04T21:24:50.688286", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1038/221 1089b0", "year": 1969, "genre": "journal-article", "is_oa": true, "title": "Planning: Trees in Danger", "doi_url": "https://doi.org/10.1038/2211089b0", "updated": "2020-02-06T13:51:15.164623", "oa_status": "bronze", "publisher": "Springer Nature", "z_authors": [{"name": "Our Planning Correspondent"}], "is_paratext": false, "journal_name": "Nature", "oa_locations": [{"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0028-0836,1476-4687", "journal_issn_l": "0028-0836", "published_date": "1969-03-01", "best_oa_location": {"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acs.bioconjchem.8b00058. s001", "year": null, "genre": "component", "is_oa": true, "title": "Engineering Reversible CellCell Interactions with Lipid Anchored Prosthetic Receptors", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "updated": "2020-04-04T21:15:41.966773", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acs.bioconjCHEM.8b00086.s001", "year": null, "genre": "component", "is_oa": true, "title": "Rapid, Stoichiometric, Site-Specific Modification of Aldehyde-Containing Proteins Using a Tandem Knoevenagel-Intra Michael Addition Reaction", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "updated": "2020-04-04T21:24:50.688286", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1192/bjp.89.375.270", "year": 1943, "genre": "journal-article", "is_oa": false, "title": "Unusual Pituitary Activity in a Case of Anorexia Nervosa", "doi_url": "https://doi.org/10.1192/bjp.89.375.270", "updated": "2020-03-09T08:54:12.827623", "oa_status": "closed", "publisher": "Royal College of Psychiatrists", "z_authors": [{"given": "M.", "family": "Reiss", "sequence": "first"}], "is_paratext": false, "journal_name": "Journal of Mental Science", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0368-315X,2514-9946", "journal_issn_l": "0368-315X", "published_date": "1943-04-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1016/s0167-7012(99)00056-1", "year": 1999, "genre": "journal-article", "is_oa": false, "title": "Development of radiographic and microscopic techniques for the characterization of bacterial transport in intact sediment cores from Oyster, Virginia", "doi_url": "https://doi.org/10.1016/s0167-7012(99)00056-1", "updated": "2020-04-05T11:15:40.634599", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Hailiang", "family": "Dong", "sequence": "first"}, {"given": "Tullis C.", "family": "Onstott", "sequence": "additional"}, {"given": "Mary F.", "family": "DeFlaun", "sequence": "additional"}, {"given": "Mark E.", "family": "Fuller", "sequence": "additional"}, {"given": "Kathleen M.", "family": "Gillespie", "sequence": "additional"}, {"given": "James K.", "family": "Fredrickson", "sequence": "additional"}], "is_paratext": false, "journal_name": "Journal of Microbiological Methods", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0167-7012", "journal_issn_l": "0167-7012", "published_date": "1999-08-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1086/mp.1905.2.issue-3", "year": 1905, "genre": "journal-issue", "is_oa": false, "title": null, "doi_url": "https://doi.org/10.1086/mp.1905.2.issue-3", "updated": "2020-02-07T15:51:44.560109", "oa_status": "closed", "publisher": "University of Chicago Press", "z_authors": null, "is_paratext": false, "journal_name": "Modern Philology", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0026-8232,1545-6951", "journal_issn_l": "0026-8232", "published_date": "1905-01-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
@ -38,7 +38,7 @@
{"doi": "10.1016/s1067-991x(03)70006-6", "year": 2003, "genre": "journal-article", "is_oa": false, "title": "Use of the autolaunch method of dispatching a helicopter", "doi_url": "https://doi.org/10.1016/s1067-991x(03)70006-6", "updated": "2020-03-12T07:24:35.659404", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Kathleen S.", "family": "Berns", "sequence": "first"}, {"given": "Jeffery J.", "family": "Caniglia", "sequence": "additional"}, {"given": "Daniel G.", "family": "Hankins", "sequence": "additional"}, {"given": "Scott P.", "family": "Zietlow", "sequence": "additional"}], "is_paratext": false, "journal_name": "Air Medical Journal", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1067-991X", "journal_issn_l": "1067-991X", "published_date": "2003-05-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1016/j.clinimag.2015.12.002", "year": 2016, "genre": "journal-article", "is_oa": false, "title": "Imaging findings, diagnosis, and clinical outcomes in patients with mycotic aneurysms: single center experience", "doi_url": "https://doi.org/10.1016/j.clinimag.2015.12.002", "updated": "2020-03-12T17:56:16.049536", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Amy R.", "family": "Deipolyi", "sequence": "first"}, {"given": "Alexander", "family": "Bailin", "sequence": "additional"}, {"given": "Ali", "family": "Khademhosseini", "sequence": "additional"}, {"ORCID": "http://orcid.org/0000-0003-4984-1778", "given": "Rahmi", "family": "Oklu", "sequence": "additional", "authenticated-orcid": false}], "is_paratext": false, "journal_name": "Clinical Imaging", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0899-7071", "journal_issn_l": "0899-7071", "published_date": "2016-05-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1016/j.biocel.2013.05.012", "year": 2013, "genre": "journal-article", "is_oa": false, "title": "MAVS-mediated host cell defense is inhibited by Borna disease virus", "doi_url": "https://doi.org/10.1016/j.biocel.2013.05.012", "updated": "2020-03-09T20:49:25.975316", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Yujun", "family": "Li", "sequence": "first"}, {"given": "Wuqi", "family": "Song", "sequence": "additional"}, {"given": "Jing", "family": "Wu", "sequence": "additional"}, {"given": "Qingmeng", "family": "Zhang", "sequence": "additional"}, {"given": "Junming", "family": "He", "sequence": "additional"}, {"given": "Aimei", "family": "Li", "sequence": "additional"}, {"given": "Jun", "family": "Qian", "sequence": "additional"}, {"given": "Aixia", "family": "Zhai", "sequence": "additional"}, {"given": "Yunlong", "family": "Hu", "sequence": "additional"}, {"given": "Wenping", "family": "Kao", "sequence": "additional"}, {"given": "Lanlan", "family": "Wei", "sequence": "additional"}, {"given": "Fengmin", "family": "Zhang", "sequence": "additional"}, {"given": "Dakang", "family": "Xu", "sequence": "additional"}], "is_paratext": false, "journal_name": "The International Journal of Biochemistry & Cell Biology", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1357-2725", "journal_issn_l": "1357-2725", "published_date": "2013-08-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acsami.8b01074.s004", "year": null, "genre": "component", "is_oa": false, "title": "Solution Coating of Pharmaceutical Nanothin Films and Multilayer Nanocomposites with Controlled Morphology and Polymorphism", "doi_url": "https://doi.org/10.1021/acsami.8b01074.s004", "updated": "2020-04-04T21:02:07.815195", "oa_status": "closed", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1021/acsami.8b01074 .s004", "year": null, "genre": "component", "is_oa": false, "title": "Solution Coating of Pharmaceutical Nanothin Films and Multilayer Nanocomposites with Controlled Morphology and Polymorphism", "doi_url": "https://doi.org/10.1021/acsami.8b01074.s004", "updated": "2020-04-04T21:02:07.815195", "oa_status": "closed", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}
{"doi": "10.1093/nar/18.18.5552", "year": 1990, "genre": "journal-article", "is_oa": true, "title": "Nucleotide sequence of LTR-gag region of Rous sarcoma virus adapted to semi-permissive host", "doi_url": "https://doi.org/10.1093/nar/18.18.5552", "updated": "2020-02-07T07:59:06.754183", "oa_status": "green", "publisher": "Oxford University Press (OUP)", "z_authors": [{"given": "Vladimir I.", "family": "Kashuba", "sequence": "first"}, {"given": "Serge V.", "family": "Zubak", "sequence": "additional"}, {"given": "Vadim M.", "family": "Kavsan", "sequence": "additional"}, {"given": "Alla V.", "family": "Rynditch", "sequence": "additional"}, {"given": "Ivo", "family": "Hlozanek", "sequence": "additional"}], "is_paratext": false, "journal_name": "Nucleic Acids Research", "oa_locations": [{"url": "http://europepmc.org/articles/pmc332244?pdf=render", "pmh_id": "oai:pubmedcentral.nih.gov:332244", "is_best": true, "license": null, "updated": "2017-10-22T11:38:23.025497", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH doi match)", "host_type": "repository", "endpoint_id": "pubmedcentral.nih.gov", "url_for_pdf": "http://europepmc.org/articles/pmc332244?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc332244", "repository_institution": "pubmedcentral.nih.gov"}, {"url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC332244", "pmh_id": null, "is_best": false, "license": null, "updated": "2020-04-24T18:18:02.810779", "version": "publishedVersion", "evidence": "oa repository (via pmcid lookup)", "host_type": "repository", "endpoint_id": null, "url_for_pdf": null, "url_for_landing_page": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC332244", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0305-1048,1362-4962", "journal_issn_l": "0305-1048", "published_date": "1990-01-01", "best_oa_location": {"url": "http://europepmc.org/articles/pmc332244?pdf=render", "pmh_id": 
"oai:pubmedcentral.nih.gov:332244", "is_best": true, "license": null, "updated": "2017-10-22T11:38:23.025497", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH doi match)", "host_type": "repository", "endpoint_id": "pubmedcentral.nih.gov", "url_for_pdf": "http://europepmc.org/articles/pmc332244?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc332244", "repository_institution": "pubmedcentral.nih.gov"}, "journal_is_in_doaj": false, "has_repository_copy": true}
{"doi": "10.1021/acsami.8b01294.s001", "year": null, "genre": "component", "is_oa": true, "title": "Highly Elastic Biodegradable Single-Network Hydrogel for Cell Printing", "doi_url": "https://doi.org/10.1021/acsami.8b01294.s001", "updated": "2020-04-04T22:12:53.813586", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acsami.8b01294.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T22:11:06.757648", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acsami.8b01294.s001", "url_for_landing_page": null, "repository_institution": null}, {"url": "http://europepmc.org/articles/pmc5876623?pdf=render", "pmh_id": "oai:pubmedcentral.nih.gov:5876623", "is_best": false, "license": "acs-specific: authorchoice/editors choice usage agreement", "updated": "2020-02-19T13:50:59.876849", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH title match)", "host_type": "repository", "endpoint_id": "ac9de7698155b820de7", "url_for_pdf": "http://europepmc.org/articles/pmc5876623?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc5876623", "repository_institution": "National Institutes of Health (USA) - US National Library of Medicine"}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acsami.8b01294.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T22:11:06.757648", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acsami.8b01294.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, 
"has_repository_copy": true}
{"doi": "10.1097/scs.0b013e3181ef67ba", "year": 2010, "genre": "journal-article", "is_oa": false, "title": "Anomaly of the Internal Carotid Artery Detected During Tonsillectomy", "doi_url": "https://doi.org/10.1097/scs.0b013e3181ef67ba", "updated": "2020-02-10T19:05:26.462040", "oa_status": "closed", "publisher": "Ovid Technologies (Wolters Kluwer Health)", "z_authors": [{"given": "Serdar", "family": "Ceylan", "sequence": "first"}, {"given": "Serkan", "family": "Salman", "sequence": "additional"}, {"given": "Fatih", "family": "Bora", "sequence": "additional"}], "is_paratext": false, "journal_name": "Journal of Craniofacial Surgery", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1049-2275", "journal_issn_l": "1049-2275", "published_date": "2010-09-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false}

View File

@ -35,7 +35,7 @@
<configuration>
<args>
<arg>-Xmax-classfile-name</arg>
<arg>140</arg>
<arg>200</arg>
</args>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>

View File

@ -3,8 +3,12 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@ -98,14 +102,9 @@ public class MergeClaimsApplication {
raw
.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer")
.map(
(MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> {
Optional<Tuple2<String, T>> opRaw = Optional.ofNullable(value._1());
Optional<Tuple2<String, T>> opClaim = Optional.ofNullable(value._2());
return opRaw.isPresent()
? opRaw.get()._2()
: opClaim.isPresent() ? opClaim.get()._2() : null;
},
(MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> processClaims(
Optional.ofNullable(value._1()),
Optional.ofNullable(value._2())),
Encoders.bean(clazz))
.filter(Objects::nonNull)
.map(
@ -117,6 +116,78 @@ public class MergeClaimsApplication {
.text(outPath);
}
/**
 * Picks the entity to keep for one row of the raw/claim full outer join.
 *
 * <p>The entity coming from the aggregator (raw) always wins when it is present. If a claim is
 * present as well and the claimed entity is a {@link Result}, the context lists of both entities
 * are merged and the merged list is set on the raw entity before it is returned.
 *
 * @param opRaw   the (id, entity) pair from the aggregator side of the join, possibly empty
 * @param opClaim the (id, entity) pair from the claim side of the join, possibly empty
 * @param <T>     the entity type being merged
 * @return the raw entity when available, otherwise the claimed entity, otherwise {@code null}
 */
private static <T extends Oaf> T processClaims(Optional<Tuple2<String, T>> opRaw,
	Optional<Tuple2<String, T>> opClaim) {

	// nothing from the aggregator: fall back to the claim (or null when both sides are missing)
	if (!opRaw.isPresent()) {
		return opClaim.map(Tuple2::_2).orElse(null);
	}

	final T rawEntity = opRaw.get()._2();

	// both sides present and the claim is a Result: enrich the raw entity with the claimed contexts
	if (opClaim.isPresent() && opClaim.get()._2() instanceof Result) {
		final Result claimedResult = (Result) opClaim.get()._2();
		final Result rawResult = (Result) rawEntity;
		rawResult.setContext(mergeContexts(claimedResult, rawResult));
	}

	return rawEntity;
}
/**
 * Builds the union of the context lists of a claimed result and a raw (aggregated) result.
 *
 * <p>Contexts are grouped by {@code Context::getId}. A context whose id occurs only once is kept
 * as is; contexts sharing the same id are collapsed into a single new context (see
 * {@code combineContexts}). A {@code null} context list on either side is treated as empty.
 *
 * <p>NOTE(review): the result is built from the values of the map produced by
 * {@code Collectors.toMap}, so the order of the returned list presumably should not be relied
 * upon.
 *
 * @param oafClaim the claimed result, its contexts are streamed first
 * @param oafRaw   the result from the aggregator, its contexts are streamed second
 * @return a new list holding one context per distinct context id
 */
private static List<Context> mergeContexts(Result oafClaim, Result oafRaw) {
	final Stream<Context> claimContexts = nullSafeStream(oafClaim.getContext());
	final Stream<Context> rawContexts = nullSafeStream(oafRaw.getContext());

	return new ArrayList<>(
		Stream
			.concat(claimContexts, rawContexts)
			.collect(
				Collectors
					.toMap(
						Context::getId,
						ctx -> ctx,
						(first, second) -> combineContexts(first, second)))
			.values());
}

/**
 * Collapses two contexts sharing the same id into a new context carrying the id of the first one
 * and the union of both dataInfo lists, keeping a single dataInfo per provenance action class id
 * (an empty string is used as key when the provenance action or its class id is missing; on a key
 * clash the first dataInfo encountered is kept).
 */
private static Context combineContexts(Context first, Context second) {
	final Context merged = new Context();
	merged.setId(first.getId());
	merged
		.setDataInfo(
			new ArrayList<>(
				Stream
					.concat(nullSafeStream(first.getDataInfo()), nullSafeStream(second.getDataInfo()))
					.collect(
						Collectors
							.toMap(
								info -> Optional
									.ofNullable(info.getProvenanceaction())
									.map(Qualifier::getClassid)
									.orElse(""),
								info -> info,
								(kept, discarded) -> kept))
					.values()));
	return merged;
}

/** Returns the stream of the given list, or an empty stream when the list is {@code null}. */
private static <E> Stream<E> nullSafeStream(List<E> list) {
	return list == null ? Stream.empty() : list.stream();
}
private static <T extends Oaf> Dataset<T> readFromPath(
SparkSession spark, String path, Class<T> clazz) {
return spark

View File

@ -480,38 +480,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final String sourceId = createOpenaireId(sourceType, rs.getString("source_id"), false);
final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false);
final Relation r1 = new Relation();
final Relation r2 = new Relation();
if (StringUtils.isNotBlank(validationDate)) {
r1.setValidated(true);
r1.setValidationDate(validationDate);
r2.setValidated(true);
r2.setValidationDate(validationDate);
}
r1.setCollectedfrom(COLLECTED_FROM_CLAIM);
r1.setSource(sourceId);
r1.setTarget(targetId);
r1.setDataInfo(DATA_INFO_CLAIM);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
r2.setCollectedfrom(COLLECTED_FROM_CLAIM);
r2.setSource(targetId);
r2.setTarget(sourceId);
r2.setDataInfo(DATA_INFO_CLAIM);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
Relation r1 = prepareRelation(sourceId, targetId, validationDate);
Relation r2 = prepareRelation(targetId, sourceId, validationDate);
final String semantics = rs.getString("semantics");
switch (semantics) {
case "resultResult_relationship_isRelatedTo":
r1.setRelType(RESULT_RESULT);
r1.setSubRelType(RELATIONSHIP);
r1.setRelClass(IS_RELATED_TO);
r2.setRelType(RESULT_RESULT);
r2.setSubRelType(RELATIONSHIP);
r2.setRelClass(IS_RELATED_TO);
r1 = setRelationSemantic(r1, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO);
r2 = setRelationSemantic(r2, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO);
break;
case "resultProject_outcome_produces":
if (!"project".equals(sourceType)) {
@ -521,13 +498,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
"invalid claim, sourceId: %s, targetId: %s, semantics: %s",
sourceId, targetId, semantics));
}
r1.setRelType(RESULT_PROJECT);
r1.setSubRelType(OUTCOME);
r1.setRelClass(PRODUCES);
r2.setRelType(RESULT_PROJECT);
r2.setSubRelType(OUTCOME);
r2.setRelClass(IS_PRODUCED_BY);
r1 = setRelationSemantic(r1, RESULT_PROJECT, OUTCOME, PRODUCES);
r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY);
break;
case "resultResult_publicationDataset_isRelatedTo":
r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
break;
default:
throw new IllegalArgumentException("claim semantics not managed: " + semantics);
@ -540,6 +516,27 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
}
}
/**
 * Creates a claim relation between the two given identifiers.
 *
 * <p>The relation is stamped with the claim collectedfrom and dataInfo constants and with this
 * application's last update timestamp. It is flagged as validated, and given the validation date,
 * only when {@code validationDate} is not blank. The relation semantics (relType, subRelType,
 * relClass) are not set here.
 *
 * @param sourceId       identifier set as the relation source
 * @param targetId       identifier set as the relation target
 * @param validationDate validation date of the claim; blank or {@code null} means not validated
 * @return the newly created relation
 */
private Relation prepareRelation(String sourceId, String targetId, String validationDate) {
	final Relation relation = new Relation();

	// endpoints and provenance shared by every claim relation
	relation.setSource(sourceId);
	relation.setTarget(targetId);
	relation.setCollectedfrom(COLLECTED_FROM_CLAIM);
	relation.setDataInfo(DATA_INFO_CLAIM);
	relation.setLastupdatetimestamp(lastUpdateTimestamp);

	// validation info is optional
	if (StringUtils.isNotBlank(validationDate)) {
		relation.setValidated(true);
		relation.setValidationDate(validationDate);
	}

	return relation;
}
/**
 * Sets the three semantic fields of a relation and hands the same instance back, so the call can
 * be used in an assignment.
 *
 * @param r          the relation to update (modified in place)
 * @param relType    value for the relation type
 * @param subRelType value for the relation sub-type
 * @param relClass   value for the relation class
 * @return the same relation instance passed as {@code r}
 */
private Relation setRelationSemantic(final Relation r, final String relType, final String subRelType,
	final String relClass) {
	r.setRelType(relType);
	r.setSubRelType(subRelType);
	r.setRelClass(relClass);

	return r;
}
private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
final Context context = new Context();
context.setId(id);

View File

@ -1,7 +1,9 @@
package eu.dnetlib.dhp.sx.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
@ -12,96 +14,104 @@ import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import java.util.Objects;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class DropAndCreateESIndex {
private static final Logger log = LoggerFactory.getLogger(DropAndCreateESIndex.class);
public static final String STATUS_CODE_TEXT = "status code: {}";
public static final String APPLICATION_JSON = "application/json";
private static final Logger log = LoggerFactory.getLogger(DropAndCreateESIndex.class);
public static final String STATUS_CODE_TEXT = "status code: {}";
public static final String APPLICATION_JSON = "application/json";
public static void main(String[] args) throws Exception {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects.requireNonNull(DropAndCreateESIndex.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/provision/dropAndCreateIndex.json"))));
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/provision/dropAndCreateIndex.json"))));
parser.parseArgument(args);
final String index = parser.get("index");
final String index = parser.get("index");
final String cluster = parser.get("cluster");
final String clusterJson = IOUtils
.toString(Objects.requireNonNull(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
final String cluster = parser.get("cluster");
final String clusterJson = IOUtils
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);
Map<String,String> clusterMap = new ObjectMapper().readValue(clusterJson,Map.class );
final String ip = clusterMap.get(cluster).split(",")[0];
final String ip = clusterMap.get(cluster).split(",")[0];
final String url = "http://%s:9200/%s_%s";
try (CloseableHttpClient client = HttpClients.createDefault()) {
final String url = "http://%s:9200/%s_%s";
HttpDelete delete = new HttpDelete(String.format(url, ip, index, "object"));
try(CloseableHttpClient client = HttpClients.createDefault()) {
CloseableHttpResponse response = client.execute(delete);
HttpDelete delete = new HttpDelete(String.format(url, ip, index, "object"));
log.info("deleting Index SUMMARY");
log.info(STATUS_CODE_TEXT, response.getStatusLine());
}
CloseableHttpResponse response = client.execute(delete);
try (CloseableHttpClient client = HttpClients.createDefault()) {
log.info("deleting Index SUMMARY");
log.info(STATUS_CODE_TEXT,response.getStatusLine());
}
HttpDelete delete = new HttpDelete(String.format(url, ip, index, "scholix"));
CloseableHttpResponse response = client.execute(delete);
try(CloseableHttpClient client = HttpClients.createDefault()) {
log.info("deleting Index SCHOLIX");
log.info(STATUS_CODE_TEXT, response.getStatusLine());
}
HttpDelete delete = new HttpDelete(String.format(url, ip, index, "scholix"));
try (CloseableHttpClient client = HttpClients.createDefault()) {
CloseableHttpResponse response = client.execute(delete);
final String summaryConf = IOUtils
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/summary_index.json")));
log.info("deleting Index SCHOLIX");
log.info(STATUS_CODE_TEXT,response.getStatusLine());
}
HttpPut put = new HttpPut(String.format(url, ip, index, "object"));
try(CloseableHttpClient client = HttpClients.createDefault()) {
StringEntity entity = new StringEntity(summaryConf);
put.setEntity(entity);
put.setHeader("Accept", APPLICATION_JSON);
put.setHeader("Content-type", APPLICATION_JSON);
final String summaryConf = IOUtils
.toString(Objects.requireNonNull(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/summary_index.json")));
log.info("creating First Index SUMMARY");
CloseableHttpResponse response = client.execute(put);
log.info(STATUS_CODE_TEXT, response.getStatusLine());
}
try (CloseableHttpClient client = HttpClients.createDefault()) {
final String scholixConf = IOUtils
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_index.json")));
HttpPut put = new HttpPut(String.format(url, ip, index, "object"));
log.info("creating Index SCHOLIX");
final HttpPut put = new HttpPut(String.format(url, ip, index, "scholix"));
StringEntity entity = new StringEntity(summaryConf);
put.setEntity(entity);
put.setHeader("Accept", APPLICATION_JSON);
put.setHeader("Content-type", APPLICATION_JSON);
final StringEntity entity = new StringEntity(scholixConf);
put.setEntity(entity);
put.setHeader("Accept", APPLICATION_JSON);
put.setHeader("Content-type", APPLICATION_JSON);
log.info("creating First Index SUMMARY");
CloseableHttpResponse response = client.execute(put);
log.info(STATUS_CODE_TEXT,response.getStatusLine());
final CloseableHttpResponse response = client.execute(put);
log.info(STATUS_CODE_TEXT, response.getStatusLine());
}
}
try(CloseableHttpClient client = HttpClients.createDefault()) {
final String scholixConf = IOUtils
.toString(Objects.requireNonNull(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/scholix_index.json")));
log.info("creating Index SCHOLIX");
final HttpPut put = new HttpPut(String.format(url, ip, index, "scholix"));
final StringEntity entity = new StringEntity(scholixConf);
put.setEntity(entity);
put.setHeader("Accept", APPLICATION_JSON);
put.setHeader("Content-type", APPLICATION_JSON);
final CloseableHttpResponse response = client.execute(put);
log.info(STATUS_CODE_TEXT, response.getStatusLine());
}
}
}
}

View File

@ -1,4 +1,6 @@
package eu.dnetlib.dhp.sx.provision;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
@ -16,43 +18,48 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Spark entry point that reads the text lines found under {@code sourcePath} and writes them to an
 * Elasticsearch index through {@code JavaEsSpark.saveJsonToEs}.
 *
 * <p>Command-line arguments are declared in the classpath resource
 * {@code /eu/dnetlib/dhp/sx/provision/index_on_es.json}; the ones read here are {@code master},
 * {@code sourcePath}, {@code index}, {@code idPath} and {@code cluster}.
 *
 * <p>NOTE(review): every statement in this listing appears twice back-to-back (including the
 * {@code main} header). This looks like the before/after sides of a formatting-only change rendered
 * without +/- markers - confirm which copy is current before compiling.
 */
public class SparkIndexCollectionOnES {
// Job entry point; declared to throw Exception, so any failure below propagates to the caller.
public static void main(String[] args) throws Exception {
public static void main(String[] args) throws Exception {
// Load the argument specification from the classpath; requireNonNull raises a
// NullPointerException when the resource is missing.
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects.requireNonNull(SparkIndexCollectionOnES.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/provision/index_on_es.json"))));
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
SparkIndexCollectionOnES.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/provision/index_on_es.json"))));
parser.parseArgument(args);
// Spark configuration: application name is this class' simple name, master URL comes from the
// "master" argument.
SparkConf conf = new SparkConf()
.setAppName(SparkIndexCollectionOnES.class.getSimpleName())
.setMaster(parser.get("master"));
SparkConf conf = new SparkConf()
.setAppName(SparkIndexCollectionOnES.class.getSimpleName())
.setMaster(parser.get("master"));
// Remaining job parameters, followed by the cluster.json resource.
// NOTE(review): cluster.json is resolved through DropAndCreateESIndex.class rather than this
// class; the path is absolute, so presumably the same resource is found - confirm.
final String sourcePath = parser.get("sourcePath");
final String index = parser.get("index");
final String idPath = parser.get("idPath");
final String cluster = parser.get("cluster");
final String clusterJson = IOUtils
.toString(Objects.requireNonNull(DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
final String sourcePath = parser.get("sourcePath");
final String index = parser.get("index");
final String idPath = parser.get("idPath");
final String cluster = parser.get("cluster");
final String clusterJson = IOUtils
.toString(
Objects
.requireNonNull(
DropAndCreateESIndex.class.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/cluster.json")));
// Parse cluster.json into a map that is later looked up with the "cluster" argument.
// NOTE(review): Map.class is a raw type, so this assignment is unchecked; values are assumed
// to be strings - confirm against cluster.json.
final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);
final Map<String, String> clusterMap = new ObjectMapper().readValue(clusterJson, Map.class);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
// try-with-resources closes the JavaSparkContext when the block exits.
try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
// One RDD element per text line under sourcePath; saveJsonToEs below presumably expects each
// line to be a complete JSON document - confirm with the producer of sourcePath.
JavaRDD<String> inputRdd = sc.textFile(sourcePath);
JavaRDD<String> inputRdd = sc.textFile(sourcePath);
// Elasticsearch connector settings (key meanings per the elasticsearch-hadoop configuration
// reference): target nodes, document-id field, bulk retry count and wait, bulk batch size,
// and WAN-only mode.
// NOTE(review): clusterMap.get(cluster) yields null when the "cluster" argument is not a key
// of cluster.json; no check is made here.
Map<String, String> esCfg = new HashMap<>();
esCfg.put("es.nodes", clusterMap.get(cluster));
esCfg.put("es.mapping.id", idPath);
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
}
}
Map<String, String> esCfg = new HashMap<>();
esCfg.put("es.nodes", clusterMap.get(cluster));
esCfg.put("es.mapping.id", idPath);
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
}
}
}

View File

@ -17,11 +17,11 @@ create table TARGET.result as
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id join SOURCE.organization o on o.id=po.organization where rp.id=r.id and o.name in (
'GEORG-AUGUST-UNIVERSITAT GOTTINGEN STIFTUNG OFFENTLICHEN RECHTS',
'ATHINA-EREVNITIKO KENTRO KAINOTOMIAS STIS TECHNOLOGIES TIS PLIROFORIAS, TON EPIKOINONION KAI TIS GNOSIS',
'Consiglio Nazionale delle Ricerche',
'Universidade do Minho') )) foo;
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id where rp.id=r.id and po.organization in (
'openorgs____::759d59f05d77188faee99b7493b46805',
'openorgs____::b84450f9864182c67b8611b5593f4250',
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
'openorgs____::eadc8da90a546e98c03f896661a2e4d4') )) foo;
-- Gather statistics for TARGET.result, which the preceding statement creates.
-- NOTE(review): `compute stats` looks like Impala syntax - confirm the engine this script targets.
compute stats TARGET.result;
-- Copy into TARGET only those SOURCE.result_citations rows whose id is present in TARGET.result.
create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);