Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_ids

This commit is contained in: stable_ids
miconis 2021-09-17 11:27:25 +02:00
commit 680bfa490f
270 changed files with 14086 additions and 2397 deletions

View File

@@ -21,6 +21,10 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
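
The dateparser dependency added above backs the new date normalization introduced in GraphCleaningFunctions further down. A minimal sketch of the library call, assuming the artifact is on the classpath (the input string is illustrative; the toLocalDate conversion mirrors the cleanDate implementation below):

import java.time.LocalDate;
import java.time.ZoneId;
import java.util.Date;

import com.github.sisyphsu.dateparser.DateParserUtils;

public class DateParserSketch {
    public static void main(String[] args) {
        // DateParserUtils guesses the input format and returns a java.util.Date
        Date parsed = DateParserUtils.parseDate("2016 Apr 05");
        LocalDate date = parsed.toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
        System.out.println(date); // 2016-04-05
    }
}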

View File

@@ -1,182 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import java.util.UUID;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstores")
public class MDStore implements Serializable {
/** */
private static final long serialVersionUID = 3160530489149700055L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
@Column(name = "hdfs_path")
private String hdfsPath;
@Column(name = "creation_date")
@Temporal(TemporalType.TIMESTAMP)
private Date creationDate;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
public Date getCreationDate() {
return creationDate;
}
public void setCreationDate(final Date creationDate) {
this.creationDate = creationDate;
}
public static MDStore newInstance(
final String format,
final String layout,
final String interpretation,
final String hdfsBasePath) {
return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
}
public static MDStore newInstance(
final String format,
final String layout,
final String interpretation,
final String dsName,
final String dsId,
final String apiId,
final String hdfsBasePath) {
final String mdId = "md-" + UUID.randomUUID();
final MDStore md = new MDStore();
md.setId(mdId);
md.setFormat(format);
md.setLayout(layout);
md.setInterpretation(interpretation);
md.setCreationDate(new Date());
md.setDatasourceName(dsName);
md.setDatasourceId(dsId);
md.setApiId(apiId);
md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
return md;
}
@Override
public String toString() {
return String
.format(
"MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStore)) {
return false;
}
final MDStore other = (MDStore) obj;
return Objects.equals(id, other.id);
}
}
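
For context, the factory method of this removed entity composed the id and HDFS path as follows; a short sketch (the format/layout/interpretation values and the base path are illustrative):

// id becomes "md-<random-uuid>", hdfsPath becomes "<basePath>/<id>"
MDStore store = MDStore.newInstance("oai_dc", "store", "cleaned", "/data/mdstores");
System.out.println(store.getId());       // e.g. md-2f9e...
System.out.println(store.getHdfsPath()); // /data/mdstores/md-2f9e...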

View File

@@ -1,74 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
@Entity
@Table(name = "mdstore_current_versions")
public class MDStoreCurrentVersion implements Serializable {
/** */
private static final long serialVersionUID = -4757725888593745773L;
@Id
@Column(name = "mdstore")
private String mdstore;
@Column(name = "current_version")
private String currentVersion;
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) {
final MDStoreCurrentVersion cv = new MDStoreCurrentVersion();
cv.setMdstore(mdId);
cv.setCurrentVersion(versionId);
return cv;
}
public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
return newInstance(v.getMdstore(), v.getId());
}
@Override
public String toString() {
return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion);
}
@Override
public int hashCode() {
return Objects.hash(currentVersion, mdstore);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreCurrentVersion)) {
return false;
}
final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
}
}

View File

@@ -1,140 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstore_versions")
public class MDStoreVersion implements Serializable {
/** */
private static final long serialVersionUID = -4763494442274298339L;
@Id
@Column(name = "id")
private String id;
@Column(name = "mdstore")
private String mdstore;
@Column(name = "writing")
private boolean writing;
@Column(name = "readcount")
private int readCount = 0;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
@Column(name = "hdfs_path")
private String hdfsPath;
public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
final MDStoreVersion v = new MDStoreVersion();
final String versionId = mdId + "-" + new Date().getTime();
v.setId(versionId);
v.setMdstore(mdId);
v.setLastUpdate(null);
v.setWriting(writing);
v.setReadCount(0);
v.setSize(0);
v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId));
return v;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public boolean isWriting() {
return writing;
}
public void setWriting(final boolean writing) {
this.writing = writing;
}
public int getReadCount() {
return readCount;
}
public void setReadCount(final int readCount) {
this.readCount = readCount;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
@Override
public String toString() {
return String
.format(
"MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
mdstore, writing, readCount, lastUpdate, size, hdfsPath);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreVersion)) {
return false;
}
final MDStoreVersion other = (MDStoreVersion) obj;
return Objects.equals(id, other.id);
}
}
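
The version id appends a creation timestamp to the mdstore id, and MDStoreCurrentVersion (above) records which version of an mdstore is current. A sketch, with an illustrative mdstore id and base path:

MDStoreVersion v = MDStoreVersion.newInstance("md-1234", true, "/data/mdstores");
// v.getId()       -> "md-1234-<epoch-millis>"
// v.getHdfsPath() -> "/data/mdstores/md-1234/md-1234-<epoch-millis>"
MDStoreCurrentVersion cv = MDStoreCurrentVersion.newInstance(v);
System.out.println(cv.getMdstore() + " -> " + cv.getCurrentVersion());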

View File

@@ -1,194 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstores_with_info")
public class MDStoreWithInfo implements Serializable {
/** */
private static final long serialVersionUID = -8445784770687571492L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
@Column(name = "current_version")
private String currentVersion;
@Column(name = "creation_date")
@Temporal(TemporalType.TIMESTAMP)
private Date creationDate;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
@Column(name = "n_versions")
private long numberOfVersions = 0;
@Column(name = "hdfs_path")
private String hdfsPath;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public Date getCreationDate() {
return creationDate;
}
public void setCreationDate(final Date creationDate) {
this.creationDate = creationDate;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
public long getNumberOfVersions() {
return numberOfVersions;
}
public void setNumberOfVersions(final long numberOfVersions) {
this.numberOfVersions = numberOfVersions;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
@Override
public String toString() {
return String
.format(
"MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
lastUpdate, size, numberOfVersions, hdfsPath);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreWithInfo)) {
return false;
}
final MDStoreWithInfo other = (MDStoreWithInfo) obj;
return Objects.equals(id, other.id);
}
}

View File

@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
private final Connection connection;
public DbClient(final String address, final String login, final String password) {

View File

@@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable {
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte data[] = new byte[1024];
byte[] data = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}

View File

@@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {

View File

@@ -21,7 +21,7 @@ public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static ObjectMapper mapper = new ObjectMapper();
private static final ObjectMapper mapper = new ObjectMapper();
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);

View File

@@ -34,7 +34,7 @@ public class MessageSender {
private final String workflowId;
private ExecutorService executorService = Executors.newCachedThreadPool();
private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;

View File

@@ -0,0 +1,459 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class GraphCleaningFunctions extends CleaningFunctions {
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.nonNull(o.getCountry())) {
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
if (Objects.nonNull(r.getSubject())) {
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
}
}
if (Objects.nonNull(r.getAuthor())) {
r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
public static <T extends Oaf> boolean filter(T value) {
if (value instanceof Datasource) {
// nothing to evaluate here
} else if (value instanceof Project) {
// nothing to evaluate here
} else if (value instanceof Organization) {
// nothing to evaluate here
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
return false;
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return true;
}
public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
Relation r = (Relation) value;
Optional<String> validationDate = doCleanDate(r.getValidationDate());
if (validationDate.isPresent()) {
r.setValidationDate(validationDate.get());
r.setValidated(true);
} else {
r.setValidationDate(null);
r.setValidated(false);
}
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getDateofacceptance())) {
Optional<String> date = cleanDateField(r.getDateofacceptance());
if (date.isPresent()) {
r.getDateofacceptance().setValue(date.get());
} else {
r.setDateofacceptance(null);
}
}
if (Objects.nonNull(r.getRelevantdate())) {
r
.setRelevantdate(
r
.getRelevantdate()
.stream()
.filter(Objects::nonNull)
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> {
sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
return sp;
})
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
r
.getSubject()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(
sp -> sp
.getValue()
.toLowerCase()
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
r.setPid(processPidCleaning(r.getPid()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid()));
}
if (Objects.nonNull(i.getAlternateIdentifier())) {
i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
}
Optional
.ofNullable(i.getPid())
.ifPresent(pid -> {
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
Optional
.ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> {
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
});
});
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i
.setAccessright(
accessRight(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {
Optional<String> date = cleanDateField(i.getDateofacceptance());
if (date.isPresent()) {
i.getDateofacceptance().setValue(date.get());
} else {
i.setDateofacceptance(null);
}
}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) {
r
.setAuthor(
r
.getAuthor()
.stream()
.filter(a -> Objects.nonNull(a))
.filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.collect(Collectors.toList()));
boolean nullRank = r
.getAuthor()
.stream()
.anyMatch(a -> Objects.isNull(a.getRank()));
if (nullRank) {
int i = 1;
for (Author author : r.getAuthor()) {
author.setRank(i++);
}
}
for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList());
} else {
a
.setPid(
a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
if (p
.getQualifier()
.getClassid()
.toLowerCase()
.contains(ModelConstants.ORCID)) {
if (pidProvenance
.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
}
final String orcid = p
.getValue()
.trim()
.toLowerCase()
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
if (orcid.length() == ORCID_LEN) {
p.setValue(orcid);
} else {
p.setValue("");
}
}
return p;
})
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(
Collectors
.toMap(
p -> p.getQualifier().getClassid() + p.getValue(),
Function.identity(),
(p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));
}
}
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
return Optional
.ofNullable(dateofacceptance)
.map(Field::getValue)
.map(GraphCleaningFunctions::cleanDate)
.filter(Objects::nonNull);
}
protected static Optional<String> doCleanDate(String date) {
return Optional.ofNullable(cleanDate(date));
}
public static String cleanDate(final String inputDate) {
if (StringUtils.isBlank(inputDate)) {
return null;
}
try {
final LocalDate date = DateParserUtils
.parseDate(inputDate.trim())
.toInstant()
.atZone(ZoneId.systemDefault())
.toLocalDate();
return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
} catch (DateTimeParseException e) {
return null;
}
}
// HELPERS
private static boolean isValidAuthorName(Author a) {
return !Stream
.of(a.getFullname(), a.getName(), a.getSurname())
.filter(s -> s != null && !s.isEmpty())
.collect(Collectors.joining(""))
.toLowerCase()
.matches(INVALID_AUTHOR_REGEX);
}
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static AccessRight accessRight(String classid, String classname, String scheme) {
return OafMapperUtils
.accessRight(
classid, classname, scheme, scheme);
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
}
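
Two of the pieces above are easiest to grasp from sample inputs: cleanDate normalizes heterogeneous date strings to yyyy-MM-dd (null when parsing fails), and ORCID_CLEANING_REGEX rewrites a string containing four 4-character groups separated by dash-like characters into the canonical 19-character ORCID form. A fragment, assuming these classes are on the classpath; inputs are illustrative:

// Date normalization (see the unit tests further down for many more cases)
System.out.println(GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM")); // 2009-05-08
System.out.println(GraphCleaningFunctions.cleanDate("not a date"));             // null

// ORCID normalization, as applied to author pids in cleanup()
String orcid = "http://orcid.org/0000-0002-1825-0097"
    .trim()
    .toLowerCase()
    .replaceAll(GraphCleaningFunctions.ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
System.out.println(orcid.length() == GraphCleaningFunctions.ORCID_LEN); // true; value is 0000-0002-1825-0097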

View File

@@ -0,0 +1,368 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils {
public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right);
} else {
throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
}
return left;
}
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (ModelSupport.isSubClass(left, Result.class)) {
return mergeResults((Result) left, (Result) right);
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
left.mergeFrom(right);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
}
return left;
}
public static Result mergeResults(Result left, Result right) {
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
}
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) {
return null;
}
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo) : null;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
return StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking);
}
public static DataInfo dataInfo(
final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new AccessRightComparator<>());
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
}
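
A sketch of two helpers above: listKeyValues consumes its varargs as (key, value) pairs, and createOpenaireId maps entity types to numeric prefixes (10 datasource, 20 organization, 30 person, 40 project, 50 for results) with an optional md5 of the local part. Values are illustrative:

List<KeyValue> kvs = OafMapperUtils.listKeyValues("k1", "v1", "k2", "v2"); // odd argument counts throw
String raw = OafMapperUtils.createOpenaireId("datasource", "ns::localId", false);
// -> "10|ns::localId"
String hashed = OafMapperUtils.createOpenaireId("datasource", "ns::localId", true);
// -> "10|ns::" + md5("localId")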

View File

@@ -1,11 +1,11 @@
package eu.dnetlib.dhp.utils;
import java.util.Map;
import javax.xml.ws.BindingProvider;
import org.apache.cxf.endpoint.Client;
import org.apache.cxf.frontend.ClientProxy;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
import org.apache.cxf.transport.http.HTTPConduit;
import org.apache.cxf.transports.http.configuration.HTTPClientPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -15,8 +15,8 @@ public class ISLookupClientFactory {
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
private static int requestTimeout = 60000 * 10;
private static int connectTimeout = 60000 * 10;
private static final int requestTimeout = 60000 * 10;
private static final int connectTimeout = 60000 * 10;
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
@@ -31,20 +31,23 @@
final T service = (T) jaxWsProxyFactory.create();
if (service instanceof BindingProvider) {
Client client = ClientProxy.getClient(service);
if (client != null) {
HTTPConduit conduit = (HTTPConduit) client.getConduit();
HTTPClientPolicy policy = new HTTPClientPolicy();
log
.info(
"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
BindingProvider.class.getName(), requestTimeout, connectTimeout);
String
.format(
"setting connectTimeout to %s, requestTimeout to %s for service %s",
connectTimeout,
requestTimeout,
clazz.getCanonicalName()));
Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
policy.setConnectionTimeout(connectTimeout);
policy.setReceiveTimeout(requestTimeout);
conduit.setClient(policy);
}
return service;
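
With this change the ten-minute timeouts are applied twice: through the JAX-WS request context properties above (honored by the JAX-WS reference implementation) and through CXF's own HTTPClientPolicy on the conduit, which is what CXF-generated clients actually enforce. Condensed, the CXF side amounts to:

// policy mirrors what getServiceStub now configures per client
HTTPClientPolicy policy = new HTTPClientPolicy();
policy.setConnectionTimeout(60000 * 10);
policy.setReceiveTimeout(60000 * 10);
HTTPConduit conduit = (HTTPConduit) ClientProxy.getClient(service).getConduit();
conduit.setClient(policy);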

View File

@@ -0,0 +1,180 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
}
@Test
public void testDate() {
System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
}
@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(p1.getCollectedfrom().size(), 1);
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
assertEquals(d2.getCollectedfrom().size(), 1);
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid()
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
assertEquals(p2.getCollectedfrom().size(), 1);
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(d1.getCollectedfrom().size(), 1);
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid()
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}

View File

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

View File

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

View File

@@ -3,20 +3,23 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Triple;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.actionmanager.set.ActionManagerSet;
@@ -25,6 +28,7 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class ISClient implements Serializable {
@@ -40,80 +44,52 @@ public class ISClient implements Serializable {
public List<String> getLatestRawsetPaths(String setIds) {
List<String> ids = Lists
.newArrayList(
final Set<String> ids = Sets
.newHashSet(
Splitter
.on(INPUT_ACTION_SET_ID_SEPARATOR)
.omitEmptyStrings()
.trimResults()
.split(setIds));
return ids
.stream()
.map(id -> getSet(isLookup, id))
.map(as -> as.getPathToLatest())
.collect(Collectors.toCollection(ArrayList::new));
}
private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) {
final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') "
+ "where $x//SET/@id = '"
+ setId
+ "' return $x";
try {
final String basePath = getBasePathHDFS(isLookup);
final String setProfile = isLookup.getResourceProfileByQuery(q);
return getActionManagerSet(basePath, setProfile);
} catch (ISLookUpException | ActionManagerException e) {
throw new RuntimeException("Error accessing Sets, using query: " + q);
// <SET id="..." directory="..." latest="xxx"/>
final String xquery = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') "
+
"return <SET id='{$x//SET/@id/string()}' directory='{$x//SET/@directory/string()}' latest='{$x//LATEST/@id/string()}'/>";
return Optional
.ofNullable(isLookup.quickSearchProfile(xquery))
.map(
sets -> sets
.stream()
.map(set -> parseSetInfo(set))
.filter(t -> ids.contains(t.getLeft()))
.map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ActionManagerException | ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS");
}
}
private ActionManagerSet getActionManagerSet(final String basePath, final String profile)
throws ActionManagerException {
final SAXReader reader = new SAXReader();
final ActionManagerSet set = new ActionManagerSet();
private Triple<String, String, String> parseSetInfo(String set) {
try {
final Document doc = reader.read(new StringReader(profile));
set.setId(doc.valueOf("//SET/@id").trim());
set.setName(doc.valueOf("//SET").trim());
set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim()));
set
.setLatest(
doc.valueOf("//RAW_SETS/LATEST/@id"),
doc.valueOf("//RAW_SETS/LATEST/@creationDate"),
doc.valueOf("//RAW_SETS/LATEST/@lastUpdate"));
set.setDirectory(doc.valueOf("//SET/@directory"));
final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED");
if (expiredNodes != null) {
for (int i = 0; i < expiredNodes.size(); i++) {
Element ex = (Element) expiredNodes.get(i);
set
.addExpired(
ex.attributeValue("id"),
ex.attributeValue("creationDate"),
ex.attributeValue("lastUpdate"));
}
}
final StringBuilder sb = new StringBuilder();
sb.append(basePath);
sb.append("/");
sb.append(doc.valueOf("//SET/@directory"));
sb.append("/");
sb.append(doc.valueOf("//RAW_SETS/LATEST/@id"));
set.setPathToLatest(sb.toString());
return set;
} catch (Exception e) {
throw new ActionManagerException("Error creating set from profile: " + profile, e);
Document doc = new SAXReader().read(new StringReader(set));
return Triple
.of(
doc.valueOf("//SET/@id"),
doc.valueOf("//SET/@directory"),
doc.valueOf("//SET/@latest"));
} catch (DocumentException e) {
throw new IllegalStateException(e);
}
}
private String buildDirectory(String basePath, Triple<String, String, String> t) {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
}
private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
return queryServiceProperty(isLookup, "basePath");
}

View File

@@ -160,9 +160,9 @@ public class PromoteActionPayloadForGraphTableJob {
private static String extractPayload(Row value) {
try {
return value.<String> getAs("payload");
return value.getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value.toString());
logger.error("cannot extract payload from action: {}", value);
throw e;
}
}
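
Row.getAs infers its return type from the assignment context, which is why the explicit type witness could be dropped. A minimal fragment, assuming an org.apache.spark.sql.Row named row:

String payload = row.<String> getAs("payload"); // explicit type witness (old style)
String same = row.getAs("payload");             // inferred from the target type (new style)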

View File

@@ -75,7 +75,6 @@ public class CollectAndSave implements Serializable {
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {

View File

@@ -36,7 +36,7 @@ import scala.Tuple2;
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi";
private static final String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();


@ -17,6 +17,7 @@ import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
@ -164,6 +165,16 @@ object DataciteToOAFTransformation {
d
}
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
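fix_thai_date re-reads a date whose year is expressed in the Thai Buddhist era (BE = CE + 543) as an ISO date; the 10.14457 DOI prefix checked below apparently identifies Thai-registered records. A minimal Java sketch of the same java.time conversion:

    import java.time.LocalDate;
    import java.time.chrono.ThaiBuddhistDate;

    public class ThaiDateExample {
        public static void main(String[] args) {
            final LocalDate parsed = LocalDate.parse("2563-04-01"); // year 2563 BE
            final ThaiBuddhistDate be = ThaiBuddhistDate
                .of(parsed.getYear(), parsed.getMonthValue(), parsed.getDayOfMonth());
            System.out.println(LocalDate.from(be)); // 2020-04-01 (2563 - 543)
        }
    }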
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -377,17 +388,31 @@ object DataciteToOAFTransformation {
.map(d => d.get)
if (a_date.isDefined) {
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
@ -468,11 +493,11 @@ object DataciteToOAFTransformation {
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
result.setId(IdentifierFactory.createIdentifier(result))
if (result.getId == null)
return List()
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}


@ -56,6 +56,7 @@ object ImportDatacite {
val hdfsTargetPath = new Path(targetPath)
log.info(s"hdfsTargetPath is $hdfsTargetPath")
val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
val skipImport = parser.get("skipImport")
log.info(s"skipImport is $skipImport")
@ -110,7 +111,7 @@ object ImportDatacite {
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
@ -137,7 +138,7 @@ object ImportDatacite {
}
}
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
var from:Long = timestamp * 1000
val delta:Long = 50000000L
var client: DataciteAPIImporter = null
@ -148,7 +149,7 @@ object ImportDatacite {
try {
var start: Long = System.currentTimeMillis
while (from < now) {
client = new DataciteAPIImporter(from, 100, from + delta)
client = new DataciteAPIImporter(from, bs, from + delta)
var end: Long = 0
val key: IntWritable = new IntWritable(i)
val value: Text = new Text


@ -143,24 +143,8 @@ public class PrepareProgramme {
JavaRDD<CSVProgramme> h2020Programmes = programme
.toJavaRDD()
.filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020"))
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
.reduceByKey((a, b) -> {
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}
return a;
})
.reduceByKey(PrepareProgramme::groupProgrammeByCode)
.map(p -> {
CSVProgramme csvProgramme = p._2();
String programmeTitle = csvProgramme.getTitle().trim();
@ -177,20 +161,31 @@ public class PrepareProgramme {
return csvProgramme;
});
// prepareClassification(h2020Programmes);
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
rdd
.map(csvProgramme -> {
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
return tmp;
})
.map(OBJECT_MAPPER::writeValueAsString)
.saveAsTextFile(outputPath);
}
private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) {
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}
return a;
}
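Extracting the merge lambda into groupProgrammeByCode lets reduceByKey take a method reference, which reads better and is testable in isolation. A minimal self-contained sketch of the pattern on toy data (the merge logic below is a simplified stand-in, not the real CSVProgramme merge):

    import java.util.Arrays;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import scala.Tuple2;

    public class ReduceByKeyExample {
        // stand-in for PrepareProgramme::groupProgrammeByCode
        private static String preferEnglish(final String a, final String b) {
            return a.endsWith("[en]") ? a : b;
        }

        public static void main(String[] args) {
            try (JavaSparkContext jsc = new JavaSparkContext("local[*]", "example")) {
                final JavaPairRDD<String, String> byCode = jsc
                    .parallelizePairs(Arrays.asList(
                        new Tuple2<>("H2020-EU.1.", "titolo [it]"),
                        new Tuple2<>("H2020-EU.1.", "title [en]")));
                byCode
                    .reduceByKey(ReduceByKeyExample::preferEnglish)
                    .collect()
                    .forEach(t -> System.out.println(t._1() + " -> " + t._2()));
            }
        }
    }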
private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes
.map(
@ -241,15 +236,15 @@ public class PrepareProgramme {
if (!ent.contains("Euratom")) {
String parent;
String tmp_key = tmp[0] + ".";
String tmpKey = tmp[0] + ".";
for (int i = 1; i < tmp.length - 1; i++) {
tmp_key += tmp[i] + ".";
parent = map.get(tmp_key)._1().toLowerCase().trim();
tmpKey += tmp[i] + ".";
parent = map.get(tmpKey)._1().toLowerCase().trim();
if (parent.contains("|")) {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
}
if (current.trim().length() > parent.length()
&& current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) {
&& current.toLowerCase().trim().startsWith(parent)) {
current = current.substring(parent.length() + 1);
if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') {
current = current.trim().substring(1).trim();
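The substring(0, parent.length()).equals(parent) form and startsWith(parent) are interchangeable here, and startsWith skips the intermediate substring allocation. For instance:

    public class StartsWithExample {
        public static void main(String[] args) {
            final String parent = "societal challenges"; // hypothetical values
            final String current = "societal challenges | health";
            System.out.println(current.substring(0, parent.length()).equals(parent)); // true
            System.out.println(current.startsWith(parent));                           // true
        }
    }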


@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
@ -32,7 +31,6 @@ public class PrepareProjects {
private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>();
public static void main(String[] args) throws Exception {
@ -93,7 +91,7 @@ public class PrepareProjects {
}
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
return value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) {


@ -120,7 +120,6 @@ public class SparkAtomicActionJob {
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {
CSVProject csvProject = c._1();
Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2());
return Optional
.ofNullable(c._2())
@ -135,9 +134,9 @@ public class SparkAtomicActionJob {
H2020Programme pm = new H2020Programme();
H2020Classification h2020classification = new H2020Classification();
pm.setCode(csvProject.getProgramme());
h2020classification.setClassification(ocsvProgramme.get().getClassification());
h2020classification.setClassification(csvProgramme.getClassification());
h2020classification.setH2020Programme(pm);
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification));
@ -145,10 +144,11 @@ public class SparkAtomicActionJob {
})
.orElse(null);
}, Encoders.bean(Project.class));
}, Encoders.bean(Project.class))
.filter(Objects::nonNull);
aaproject
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")))
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
Project rp = p._1();
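Changing the join type to "left" keeps projects whose h2020topiccode finds no matching EXCEL topic (the right side of the tuple is then null) instead of silently dropping them. A minimal sketch of the behaviour, with plain string datasets standing in for Project and EXCELTopic:

    import java.util.Arrays;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;

    public class LeftJoinWithExample {
        public static void main(String[] args) {
            final SparkSession spark = SparkSession.builder().master("local[*]").appName("left-join").getOrCreate();
            final Dataset<String> projects = spark.createDataset(Arrays.asList("T1", "T2"), Encoders.STRING());
            final Dataset<String> topics = spark.createDataset(Arrays.asList("T1"), Encoders.STRING());
            projects
                .joinWith(topics, projects.col("value").equalTo(topics.col("value")), "left")
                .collectAsList()
                .forEach(t -> System.out.println(t._1() + " -> " + t._2())); // T2 -> null
            spark.stop();
        }
    }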


@ -7,14 +7,7 @@ import java.io.Serializable;
* The model for the programme csv file
*/
public class CSVProgramme implements Serializable {
private String parentProgramme;
private String frameworkProgramme;
private String startDate;
private String endDate;
private String objective;
private String subjects;
private String legalBasis;
private String call;
private String rcn;
private String code;
@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable {
this.language = language;
}
public String getParentProgramme() {
return parentProgramme;
}
public void setParentProgramme(String parentProgramme) {
this.parentProgramme = parentProgramme;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getSubjects() {
return subjects;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
}
public String getLegalBasis() {
return legalBasis;
}
public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}
public String getCall() {
return call;
}
public void setCall(String call) {
this.call = call;
}
//
}


@ -22,15 +22,18 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
*/
public class EXCELParser {
public <R> List<R> parse(InputStream file, String classForName)
public <R> List<R> parse(InputStream file, String classForName, String sheetName)
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
InvalidFormatException {
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
OPCPackage pkg = OPCPackage.open(file);
XSSFWorkbook wb = new XSSFWorkbook(pkg);
XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
XSSFSheet sheet = wb.getSheet(sheetName);
if (sheet == null) {
	throw new RuntimeException("Sheet " + sheetName + " not present in current file");
}
List<R> ret = new ArrayList<>();
@ -49,12 +52,11 @@ public class EXCELParser {
headers.add(dataFormatter.formatCellValue(cell));
}
} else {
Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
Class<?> clazz = Class.forName(classForName);
final Object cc = clazz.newInstance();
for (int i = 0; i < headers.size(); i++) {
Cell cell = row.getCell(i);
String value = dataFormatter.formatCellValue(cell);
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
}
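For context: the parser instantiates the class named by classForName and, for each cell, writes the value into the bean field whose name matches the column header. A minimal sketch of that reflection mechanism, with a hypothetical TopicBean standing in for EXCELTopic:

    import org.apache.commons.lang3.reflect.FieldUtils;

    public class ReflectionWriteExample {
        public static class TopicBean { // hypothetical stand-in for EXCELTopic
            private String code;

            public String getCode() {
                return code;
            }
        }

        public static void main(String[] args) throws Exception {
            final Class<?> clazz = Class.forName(ReflectionWriteExample.class.getName() + "$TopicBean");
            final Object cc = clazz.newInstance();
            // header name -> field name; forceAccess=true writes the private field
            FieldUtils.writeField(cc, "code", "H2020-EU.1.", true);
            System.out.println(((TopicBean) cc).getCode()); // H2020-EU.1.
        }
    }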


@ -28,7 +28,7 @@ public class ReadCSV implements Closeable {
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private String csvFile;
private final String csvFile;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -85,7 +85,6 @@ public class ReadCSV implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.csvFile = httpConnector.getInputSource(fileURL);
;
}
protected void write(final Object p) {


@ -25,7 +25,7 @@ public class ReadExcel implements Closeable {
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private InputStream excelFile;
private final InputStream excelFile;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -41,19 +41,20 @@ public class ReadExcel implements Closeable {
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode");
final String classForName = parser.get("classForName");
final String sheetName = parser.get("sheetName");
try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {
log.info("Getting Excel file...");
readExcel.execute(classForName);
readExcel.execute(classForName, sheetName);
}
}
public void execute(final String classForName) throws Exception {
public void execute(final String classForName, final String sheetName) throws Exception {
EXCELParser excelParser = new EXCELParser();
excelParser
.parse(excelFile, classForName)
.parse(excelFile, classForName, sheetName)
.stream()
.forEach(p -> write(p));
@ -82,7 +83,6 @@ public class ReadExcel implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
;
}
protected void write(final Object p) {


@ -0,0 +1,215 @@
package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class GenerateRorActionSetJob {
private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ROR_NS_PREFIX = "ror_________";
private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
"10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");
private static final DataInfo ROR_DATA_INFO = dataInfo(
false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");
private static final Qualifier ROR_PID_TYPE = qualifier(
"ROR", "ROR", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);
public static void main(final String[] args) throws Exception {
final String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionJob.class
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
processRorOrganizations(spark, inputPath, outputPath);
});
}
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static void processRorOrganizations(final SparkSession spark,
final String inputPath,
final String outputPath) throws Exception {
readInputPath(spark, inputPath)
.map(
(MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
Encoders.bean(Organization.class))
.toJavaRDD()
.map(o -> new AtomicAction<>(Organization.class, o))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
protected static Organization convertRorOrg(final RorOrganization r) {
final Date now = new Date();
final Organization o = new Organization();
o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
o.setCollectedfrom(ROR_COLLECTED_FROM);
o.setPid(pids(r));
o.setDateofcollection(now.toString());
o.setDateoftransformation(now.toString());
o.setExtraInfo(new ArrayList<>()); // Values not present in the file
o.setOaiprovenance(null); // Values not present in the file
o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO));
o.setLegalname(field(r.getName(), ROR_DATA_INFO));
o.setAlternativeNames(alternativeNames(r));
o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO));
o.setLogourl(null);
o.setEclegalbody(null);
o.setEclegalperson(null);
o.setEcnonprofit(null);
o.setEcresearchorganization(null);
o.setEchighereducation(null);
o.setEcinternationalorganizationeurinterests(null);
o.setEcinternationalorganization(null);
o.setEcenterprise(null);
o.setEcsmevalidated(null);
o.setEcnutscode(null);
if (r.getCountry() != null) {
o
.setCountry(
qualifier(
r.getCountry().getCountryCode(), r
.getCountry()
.getCountryName(),
ModelConstants.DNET_COUNTRY_TYPE, ModelConstants.DNET_COUNTRY_TYPE));
} else {
o.setCountry(null);
}
o.setDataInfo(ROR_DATA_INFO);
o.setLastupdatetimestamp(now.getTime());
return o;
}
private static List<StructuredProperty> pids(final RorOrganization r) {
final List<StructuredProperty> pids = new ArrayList<>();
pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO));
for (final Map.Entry<String, ExternalIdType> e : r.getExternalIds().entrySet()) {
final String type = e.getKey();
final List<String> all = e.getValue().getAll();
if (all != null) {
final Qualifier qualifier = qualifier(
type, type, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);
for (final String pid : all) {
pids
.add(structuredProperty(pid, qualifier, ROR_DATA_INFO));
}
}
}
return pids;
}
private static List<Field<String>> alternativeNames(final RorOrganization r) {
final Set<String> names = new LinkedHashSet<>();
names.addAll(r.getAliases());
names.addAll(r.getAcronyms());
r.getLabels().forEach(l -> names.add(l.getLabel()));
return names
.stream()
.filter(StringUtils::isNotBlank)
.map(s -> field(s, ROR_DATA_INFO))
.collect(Collectors.toList());
}
private static Dataset<RorOrganization> readInputPath(
final SparkSession spark,
final String path) throws Exception {
try (final FileSystem fileSystem = FileSystem.get(new Configuration());
final InputStream is = fileSystem.open(new Path(path))) {
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
}
}
}
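The organization identifier built in convertRorOrg follows the pattern 20|&lt;namespace prefix&gt;::&lt;md5 of the ROR id&gt;. A minimal sketch of the same construction using plain MessageDigest, assuming DHPUtils.md5 is a straight lower-case MD5 hex digest:

    import java.math.BigInteger;
    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;

    public class RorIdExample {
        public static void main(String[] args) throws Exception {
            final String rorId = "https://ror.org/03yrm5c26"; // sample ROR id
            final MessageDigest md = MessageDigest.getInstance("MD5");
            final String md5 = String.format("%032x",
                new BigInteger(1, md.digest(rorId.getBytes(StandardCharsets.UTF_8))));
            System.out.println(String.format("20|%s::%s", "ror_________", md5));
        }
    }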


@ -0,0 +1,122 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class Address implements Serializable {
@JsonProperty("lat")
private Float lat;
@JsonProperty("state_code")
private String stateCode;
@JsonProperty("country_geonames_id")
private Integer countryGeonamesId;
@JsonProperty("lng")
private Float lng;
@JsonProperty("state")
private String state;
@JsonProperty("city")
private String city;
@JsonProperty("geonames_city")
private GeonamesCity geonamesCity;
@JsonProperty("postcode")
private String postcode;
@JsonProperty("primary")
private Boolean primary;
@JsonProperty("line")
private String line;
private final static long serialVersionUID = 2444635485253443195L;
public Float getLat() {
return lat;
}
public void setLat(final Float lat) {
this.lat = lat;
}
public String getStateCode() {
return stateCode;
}
public void setStateCode(final String stateCode) {
this.stateCode = stateCode;
}
public Integer getCountryGeonamesId() {
return countryGeonamesId;
}
public void setCountryGeonamesId(final Integer countryGeonamesId) {
this.countryGeonamesId = countryGeonamesId;
}
public Float getLng() {
return lng;
}
public void setLng(final Float lng) {
this.lng = lng;
}
public String getState() {
return state;
}
public void setState(final String state) {
this.state = state;
}
public String getCity() {
return city;
}
public void setCity(final String city) {
this.city = city;
}
public GeonamesCity getGeonamesCity() {
return geonamesCity;
}
public void setGeonamesCity(final GeonamesCity geonamesCity) {
this.geonamesCity = geonamesCity;
}
public String getPostcode() {
return postcode;
}
public void setPostcode(final String postcode) {
this.postcode = postcode;
}
public Boolean getPrimary() {
return primary;
}
public void setPrimary(final Boolean primary) {
this.primary = primary;
}
public String getLine() {
return line;
}
public void setLine(final String line) {
this.line = line;
}
}


@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class Country implements Serializable {
@JsonProperty("country_code")
private String countryCode;
@JsonProperty("country_name")
private String countryName;
private final static long serialVersionUID = 4357848706229493627L;
public String getCountryCode() {
return countryCode;
}
public void setCountryCode(final String countryCode) {
this.countryCode = countryCode;
}
public String getCountryName() {
return countryName;
}
public void setCountryName(final String countryName) {
this.countryName = countryName;
}
}


@ -0,0 +1,42 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
@JsonDeserialize(using = ExternalIdTypeDeserializer.class)
public class ExternalIdType implements Serializable {
private List<String> all;
private String preferred;
private final static long serialVersionUID = 2616688352998387611L;
public ExternalIdType() {
}
public ExternalIdType(final List<String> all, final String preferred) {
this.all = all;
this.preferred = preferred;
}
public List<String> getAll() {
return all;
}
public void setAll(final List<String> all) {
this.all = all;
}
public String getPreferred() {
return preferred;
}
public void setPreferred(final String preferred) {
this.preferred = preferred;
}
}


@ -0,0 +1,38 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
public class ExternalIdTypeDeserializer extends JsonDeserializer<ExternalIdType> {
@Override
public ExternalIdType deserialize(final JsonParser p, final DeserializationContext ctxt)
throws IOException, JsonProcessingException {
final ObjectCodec oc = p.getCodec();
final JsonNode node = oc.readTree(p);
final JsonNode allNode = node.get("all");
final String preferred = node.get("preferred").asText();
final List<String> all = new ArrayList<>();
if (allNode.isArray()) {
allNode.elements().forEachRemaining(x -> all.add(x.asText()));
} else {
all.add(allNode.asText());
}
return new ExternalIdType(all, preferred);
}
}
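The custom deserializer is needed because the ROR dump serializes external_ids.*.all sometimes as a JSON array and sometimes as a single scalar. A minimal round trip exercising both shapes (sample values are made up):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;

    public class ExternalIdTypeExample {
        public static void main(String[] args) throws Exception {
            final ObjectMapper mapper = new ObjectMapper();
            final ExternalIdType a = mapper
                .readValue("{\"all\": [\"grid.1001.0\"], \"preferred\": \"grid.1001.0\"}", ExternalIdType.class);
            final ExternalIdType b = mapper
                .readValue("{\"all\": \"0000 0001 2180 7477\", \"preferred\": \"0000 0001 2180 7477\"}", ExternalIdType.class);
            System.out.println(a.getAll()); // [grid.1001.0]
            System.out.println(b.getAll()); // [0000 0001 2180 7477] - scalar wrapped in a list
        }
    }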


@ -0,0 +1,56 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class GeonamesAdmin implements Serializable {
@JsonProperty("ascii_name")
private String asciiName;
@JsonProperty("id")
private Integer id;
@JsonProperty("name")
private String name;
@JsonProperty("code")
private String code;
private final static long serialVersionUID = 7294958526269195673L;
public String getAsciiName() {
return asciiName;
}
public void setAsciiName(final String asciiName) {
this.asciiName = asciiName;
}
public Integer getId() {
return id;
}
public void setId(final Integer id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(final String name) {
this.name = name;
}
public String getCode() {
return code;
}
public void setCode(final String code) {
this.code = code;
}
}


@ -0,0 +1,100 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class GeonamesCity implements Serializable {
@JsonProperty("geonames_admin1")
private GeonamesAdmin geonamesAdmin1;
@JsonProperty("geonames_admin2")
private GeonamesAdmin geonamesAdmin2;
@JsonProperty("city")
private String city;
@JsonProperty("id")
private Integer id;
@JsonProperty("nuts_level1")
private NameAndCode nutsLevel1;
@JsonProperty("nuts_level2")
private NameAndCode nutsLevel2;
@JsonProperty("nuts_level3")
private NameAndCode nutsLevel3;
@JsonProperty("license")
private License license;
private final static long serialVersionUID = -8389480201526252955L;
public NameAndCode getNutsLevel2() {
return nutsLevel2;
}
public void setNutsLevel2(final NameAndCode nutsLevel2) {
this.nutsLevel2 = nutsLevel2;
}
public GeonamesAdmin getGeonamesAdmin2() {
return geonamesAdmin2;
}
public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) {
this.geonamesAdmin2 = geonamesAdmin2;
}
public GeonamesAdmin getGeonamesAdmin1() {
return geonamesAdmin1;
}
public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) {
this.geonamesAdmin1 = geonamesAdmin1;
}
public String getCity() {
return city;
}
public void setCity(final String city) {
this.city = city;
}
public Integer getId() {
return id;
}
public void setId(final Integer id) {
this.id = id;
}
public NameAndCode getNutsLevel1() {
return nutsLevel1;
}
public void setNutsLevel1(final NameAndCode nutsLevel1) {
this.nutsLevel1 = nutsLevel1;
}
public NameAndCode getNutsLevel3() {
return nutsLevel3;
}
public void setNutsLevel3(final NameAndCode nutsLevel3) {
this.nutsLevel3 = nutsLevel3;
}
public License getLicense() {
return license;
}
public void setLicense(final License license) {
this.license = license;
}
}


@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class Label implements Serializable {
@JsonProperty("iso639")
private String iso639;
@JsonProperty("label")
private String label;
private final static long serialVersionUID = -6576156103297850809L;
public String getIso639() {
return iso639;
}
public void setIso639(final String iso639) {
this.iso639 = iso639;
}
public String getLabel() {
return label;
}
public void setLabel(final String label) {
this.label = label;
}
}


@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class License implements Serializable {
@JsonProperty("attribution")
private String attribution;
@JsonProperty("license")
private String license;
private final static long serialVersionUID = -194308261058176439L;
public String getAttribution() {
return attribution;
}
public void setAttribution(final String attribution) {
this.attribution = attribution;
}
public String getLicense() {
return license;
}
public void setLicense(final String license) {
this.license = license;
}
}


@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class NameAndCode implements Serializable {
@JsonProperty("name")
private String name;
@JsonProperty("code")
private String code;
private final static long serialVersionUID = 5459836979206140843L;
public String getName() {
return name;
}
public void setName(final String name) {
this.name = name;
}
public String getCode() {
return code;
}
public void setCode(final String code) {
this.code = code;
}
}


@ -0,0 +1,45 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
public class Relationship implements Serializable {
@JsonProperty("type")
private String type;
@JsonProperty("id")
private String id;
@JsonProperty("label")
private String label;
private final static long serialVersionUID = 7847399503395576960L;
public String getType() {
return type;
}
public void setType(final String type) {
this.type = type;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getLabel() {
return label;
}
public void setLabel(final String label) {
this.label = label;
}
}


@ -0,0 +1,192 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
public class RorOrganization implements Serializable {
@JsonProperty("ip_addresses")
private List<String> ipAddresses = new ArrayList<>();
@JsonProperty("aliases")
private List<String> aliases = new ArrayList<>();
@JsonProperty("acronyms")
private List<String> acronyms = new ArrayList<>();
@JsonProperty("links")
private List<String> links = new ArrayList<>();
@JsonProperty("country")
private Country country;
@JsonProperty("name")
private String name;
@JsonProperty("wikipedia_url")
private String wikipediaUrl;
@JsonProperty("addresses")
private List<Address> addresses = new ArrayList<>();
@JsonProperty("types")
private List<String> types = new ArrayList<>();
@JsonProperty("established")
private Integer established;
@JsonProperty("relationships")
private List<Relationship> relationships = new ArrayList<>();
@JsonProperty("email_address")
private String emailAddress;
@JsonProperty("external_ids")
private Map<String, ExternalIdType> externalIds = new LinkedHashMap<>();
@JsonProperty("id")
private String id;
@JsonProperty("labels")
private List<Label> labels = new ArrayList<>();
@JsonProperty("status")
private String status;
private final static long serialVersionUID = -2658312087616043225L;
public List<String> getIpAddresses() {
return ipAddresses;
}
public void setIpAddresses(final List<String> ipAddresses) {
this.ipAddresses = ipAddresses;
}
public List<String> getAliases() {
return aliases;
}
public void setAliases(final List<String> aliases) {
this.aliases = aliases;
}
public List<String> getAcronyms() {
return acronyms;
}
public void setAcronyms(final List<String> acronyms) {
this.acronyms = acronyms;
}
public List<String> getLinks() {
return links;
}
public void setLinks(final List<String> links) {
this.links = links;
}
public Country getCountry() {
return country;
}
public void setCountry(final Country country) {
this.country = country;
}
public String getName() {
return name;
}
public void setName(final String name) {
this.name = name;
}
public String getWikipediaUrl() {
return wikipediaUrl;
}
public void setWikipediaUrl(final String wikipediaUrl) {
this.wikipediaUrl = wikipediaUrl;
}
public List<Address> getAddresses() {
return addresses;
}
public void setAddresses(final List<Address> addresses) {
this.addresses = addresses;
}
public List<String> getTypes() {
return types;
}
public void setTypes(final List<String> types) {
this.types = types;
}
public Integer getEstablished() {
return established;
}
public void setEstablished(final Integer established) {
this.established = established;
}
public List<Relationship> getRelationships() {
return relationships;
}
public void setRelationships(final List<Relationship> relationships) {
this.relationships = relationships;
}
public String getEmailAddress() {
return emailAddress;
}
public void setEmailAddress(final String emailAddress) {
this.emailAddress = emailAddress;
}
public Map<String, ExternalIdType> getExternalIds() {
return externalIds;
}
public void setExternalIds(final Map<String, ExternalIdType> externalIds) {
this.externalIds = externalIds;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public List<Label> getLabels() {
return labels;
}
public void setLabels(final List<Label> labels) {
this.labels = labels;
}
public String getStatus() {
return status;
}
public void setStatus(final String status) {
this.status = status;
}
}


@ -18,7 +18,7 @@ public abstract class ReportingJob {
*/
public static final int INITIAL_DELAY = 2;
private ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
protected final AggregatorReport report;


@ -14,9 +14,9 @@ import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class MDStoreActionNode {
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);


@ -16,7 +16,6 @@ import org.apache.hadoop.io.compress.DeflateCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
@ -25,6 +24,7 @@ import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class CollectorWorker extends ReportingJob {


@ -13,10 +13,10 @@ import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
/**
* CollectorWorkerApplication is the main class responsible for starting the metadata collection process, storing the outcomes
@ -30,7 +30,7 @@ public class CollectorWorkerApplication {
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
private FileSystem fileSystem;
private final FileSystem fileSystem;
public CollectorWorkerApplication(FileSystem fileSystem) {
this.fileSystem = fileSystem;


@ -28,8 +28,8 @@ import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import scala.Tuple2;


@ -32,7 +32,7 @@ public class HttpConnector2 {
private String responseType = null;
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
@ -131,20 +131,12 @@ public class HttpConnector2 {
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode())) {
// CLIENT ERROR, DO NOT RETRY
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException("4xx error: request will not be repeated. " + report);
}
if (is5xx(urlConn.getResponseCode())) {
// SERVER SIDE ERRORS RETRY ONLY on 503
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
if (retryAfter > 0) {
log
.warn(
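The consolidated branch classifies the status code once, then the switch retries only the listed codes (honouring Retry-After when present). A sketch of the classification helpers it relies on, assuming is4xx/is5xx are plain range checks defined elsewhere in the file:

    public class StatusClassExample {
        static boolean is4xx(final int statusCode) {
            return statusCode / 100 == 4;
        }

        static boolean is5xx(final int statusCode) {
            return statusCode / 100 == 5;
        }

        public static void main(String[] args) {
            System.out.println(is4xx(404)); // true  - client error
            System.out.println(is5xx(503)); // true  - retry candidate
            System.out.println(is5xx(200)); // false
        }
    }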


@ -11,6 +11,8 @@ import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;


@ -21,6 +21,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
public class OaiCollectorPlugin implements CollectorPlugin {
public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";
private static final String FORMAT_PARAM = "format";
private static final String OAI_SET_PARAM = "set";
private static final Object OAI_FROM_DATE_PARAM = "fromDate";
@ -62,11 +65,11 @@ public class OaiCollectorPlugin implements CollectorPlugin {
throw new CollectorException("Param 'mdFormat' is null or empty");
}
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (fromDate != null && !fromDate.matches(DATE_REGEX) && !fromDate.matches(UTC_DATETIME_REGEX)) {
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
}
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (untilDate != null && !untilDate.matches(DATE_REGEX) && !untilDate.matches(UTC_DATETIME_REGEX)) {
throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
}
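With this change the OAI from/until parameters accept either a plain date or a UTC datetime. A small check of both shapes against the new constants:

    public class OaiDateRegexExample {
        static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
        static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";

        public static void main(String[] args) {
            System.out.println("2021-03-01".matches(DATE_REGEX));                   // true
            System.out.println("2021-03-01T12:00:00Z".matches(UTC_DATETIME_REGEX)); // true
            System.out.println("01/03/2021".matches(DATE_REGEX));                   // false -> CollectorException
        }
    }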


@ -42,7 +42,7 @@ public class OaiIterator implements Iterator<String> {
private String token;
private boolean started;
private final HttpConnector2 httpConnector;
private AggregatorReport report;
private final AggregatorReport report;
public OaiIterator(
final String baseUrl,
@ -107,10 +107,12 @@ public class OaiIterator implements Iterator<String> {
if (set != null && !set.isEmpty()) {
url += "&set=" + URLEncoder.encode(set, "UTF-8");
}
if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (fromDate != null && (fromDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| fromDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
}
if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
if (untilDate != null && (untilDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| untilDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
}
log.info("Start harvesting using url: " + url);


@ -26,7 +26,7 @@ public class RestCollectorPlugin implements CollectorPlugin {
public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
private HttpClientParams clientParams;
private final HttpClientParams clientParams;
public RestCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;


@ -48,18 +48,18 @@ public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8";
private HttpClientParams clientParams;
private final HttpClientParams clientParams;
private final String BASIC = "basic";
private JsonUtils jsonUtils;
private final JsonUtils jsonUtils;
private String baseUrl;
private String resumptionType;
private String resumptionParam;
private String resultFormatValue;
private final String baseUrl;
private final String resumptionType;
private final String resumptionParam;
private final String resultFormatValue;
private String queryParams;
private int resultSizeValue;
private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
@ -71,11 +71,11 @@ public class RestIterator implements Iterator<String> {
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private String queryFormat;
private String querySize;
private String authMethod;
private String authToken;
private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private final String queryFormat;
private final String querySize;
private final String authMethod;
private final String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private int discoverResultSize = 0;
private int pagination = 1;
/*
@ -83,7 +83,7 @@ public class RestIterator implements Iterator<String> {
* json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
* json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
*/
private String resultOutputFormat;
private final String resultOutputFormat;
/** RestIterator class
* compatible with version 1.3.33
@ -229,7 +229,7 @@ public class RestIterator implements Iterator<String> {
resultStream = theHttpInputStream;
if ("json".equals(resultOutputFormat)) {
resultJson = IOUtils.toString(resultStream, UTF_8);
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = jsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}


@ -22,14 +22,13 @@ import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -37,7 +36,7 @@ public class TransformSparkJobNode {
private static final Logger log = LoggerFactory.getLogger(TransformSparkJobNode.class);
private static int RECORDS_PER_TASK = 200;
private static final int RECORDS_PER_TASK = 200;
public static void main(String[] args) throws Exception {


@ -10,87 +10,11 @@ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import net.sf.saxon.s9api.*;
public class DateCleaner implements ExtensionFunction, Serializable {
private final static List<Pattern> dateRegex = Arrays
.asList(
// Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
// M-D-Y
Pattern
.compile(
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
Pattern.MULTILINE),
// D-M-Y
Pattern
.compile(
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
Pattern.MULTILINE),
// Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
private final static Pattern incompleteDateRegex = Pattern
.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
private final static List<DateTimeFormatter> dformats = Arrays
.asList(
DateTimeFormatter
.ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH),
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
public String clean(final String inputDate) {
Optional<String> cleanedDate = dateRegex
.stream()
.map(
p -> {
final Matcher matcher = p.matcher(inputDate);
if (matcher.find())
return matcher.group(0);
else
return null;
})
.filter(Objects::nonNull)
.map(m -> {
Optional<String> cleanDate = dformats
.stream()
.map(f -> {
try {
LocalDate parsedDate = LocalDate.parse(m, f);
if (parsedDate != null)
return parsedDate.toString();
else
return null;
} catch (Throwable e) {
return null;
}
}
)
.filter(Objects::nonNull)
.findAny();
return cleanDate.orElse(null);
})
.filter(Objects::nonNull)
.findAny();
if (cleanedDate.isPresent())
return cleanedDate.get();
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
if (matcher.find()) {
final Integer year = Integer.parseInt(matcher.group(1));
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4));
return String.format("%d-%02d-01", year, month);
}
return null;
}
@Override
public QName getName() {
return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
@ -117,4 +41,9 @@ public class DateCleaner implements ExtensionFunction, Serializable {
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
return new XdmAtomicValue(clean(currentValue));
}
// for backward compatibility with the existing unit tests
public String clean(String date) {
return GraphCleaningFunctions.cleanDate(date);
}
}
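The class keeps its Saxon extension-function surface but now delegates the actual normalization to GraphCleaningFunctions.cleanDate. A hedged usage sketch, assuming the output stays ISO yyyy-MM-dd as the removed regex-based implementation produced:

    public class DateCleanerExample {
        public static void main(String[] args) {
            final DateCleaner dc = new DateCleaner(); // same package assumed
            System.out.println(dc.clean("18/02/2019")); // expected: 2019-02-18
            System.out.println(dc.clean("2019"));       // expected: 2019-01-01 (incomplete dates padded)
        }
    }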


@ -26,7 +26,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private static Set<String> particles = null;
private static final Set<String> particles = null;
public PersonCleaner() {


@ -20,6 +20,10 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";
private final static String DATASOURCE_ID_PARAM = "varDataSourceId";
private final static String DATASOURCE_NAME_PARAM = "varOfficialName";
private final AggregationCounter aggregationCounter;
private final String transformationRule;
@ -48,11 +52,16 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
aggregationCounter.getTotalItems().add(1);
try {
Processor processor = new Processor(false);
processor.registerExtensionFunction(cleanFunction);
processor.registerExtensionFunction(new DateCleaner());
processor.registerExtensionFunction(new PersonCleaner());
final XsltCompiler comp = processor.newXsltCompiler();
QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));
QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM);
comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName()));
XsltExecutable xslt = comp
.compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
XdmNode source = processor


@ -18,6 +18,12 @@
"paramDescription": "avoid to downlaod new items but apply the previous update",
"paramRequired": false
},
{
"paramName": "bs",
"paramLongName": "blocksize",
"paramDescription": "define the requests block size",
"paramRequired": false
},
{
"paramName": "n",
"paramLongName": "namenode",


@ -8,6 +8,12 @@
<name>isLookupUrl</name>
<description>The IS lookUp service endpoint</description>
</property>
<property>
<name>blocksize</name>
<value>100</value>
<description>The request block size</description>
</property>
</parameters>
<start to="ImportDatacite"/>
@ -37,6 +43,7 @@
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--blocksize</arg><arg>${blocksize}</arg>
</spark>
<ok to="TransformJob"/>
<error to="Kill"/>


@ -1,4 +1,4 @@
<workflow-app name="H2020Programme" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>projectFileURL</name>
@ -18,6 +18,10 @@
<name>outputPath</name>
<description>path where to store the action set</description>
</property>
<property>
<name>sheetName</name>
<description>the name of the sheet to read</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
@ -31,10 +35,23 @@
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="get_project_file"/>
<ok to="fork_get_info"/>
<error to="Kill"/>
</action>
<fork name="fork_get_info">
<path start="fork_get_projects"/>
<path start="get_programme_file"/>
<path start="get_topic_file"/>
</fork>
<fork name="fork_get_projects">
<path start="get_project_file"/>
<path start="read_projects"/>
</fork>
<action name="get_project_file">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
@ -43,7 +60,7 @@
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
</java>
<ok to="get_programme_file"/>
<ok to="wait_projects"/>
<error to="Kill"/>
</action>
@ -55,7 +72,7 @@
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
</java>
<ok to="get_topic_file"/>
<ok to="prepare_programme"/>
<error to="Kill"/>
</action>
@ -65,9 +82,10 @@
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${topicFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
<arg>--sheetName</arg><arg>${sheetName}</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
</java>
<ok to="read_projects"/>
<ok to="wait"/>
<error to="Kill"/>
</action>
@ -80,7 +98,7 @@
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
</java>
<ok to="prepare_programme"/>
<ok to="wait_projects"/>
<error to="Kill"/>
</action>
@ -104,10 +122,15 @@
<arg>--programmePath</arg><arg>${workingDir}/programme</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
</spark>
<ok to="prepare_project"/>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="create_updates"/>
<join name="wait_projects" to="prepare_project"/>
<action name="prepare_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -129,7 +152,7 @@
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
</spark>
<ok to="create_updates"/>
<ok to="wait"/>
<error to="Kill"/>
</action>


@ -23,6 +23,11 @@
"paramLongName" : "classForName",
"paramDescription" : "the name of the class to deserialize the csv to",
"paramRequired" : true
}, {
"paramName": "sn",
"paramLongName" : "sheetName",
"paramDescription" : "the name of the sheet in case the file is excel",
"paramRequired" : false
}


@ -0,0 +1,14 @@
[
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the path of the input json",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]


@ -0,0 +1,58 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>


@ -0,0 +1,55 @@
<workflow-app name="Update_ROR_action_set" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>rorJsonInputPath</name>
<description>the path of the json</description>
</property>
<property>
<name>rorActionSetPath</name>
<description>the path where the action set will be stored</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path='${rorActionSetPath}'/>
<mkdir path='${rorActionSetPath}'/>
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="processRorFile"/>
<error to="Kill"/>
</action>
<action name="processRorFile">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProcessRorFile</name>
<class>eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${rorJsonInputPath}</arg>
<arg>--outputPath</arg><arg>${rorActionSetPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
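The deleteoutputpath action follows the usual delete-then-recreate pattern so that reruns of the workflow start from a clean slate. A minimal sketch of the same idea with the Hadoop FileSystem API (illustrative only, not code from this commit):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CleanOutputPath {

	// Recursively drops the path if present, then recreates it empty,
	// mirroring the <delete>/<mkdir> pairs in the fs action above.
	public static void recreate(final String path) throws Exception {
		final FileSystem fs = FileSystem.get(new Configuration());
		fs.delete(new Path(path), true);
		fs.mkdirs(new Path(path));
	}
}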

View File

@ -21,7 +21,7 @@ public class EXCELParserTest {
private static Path workingDir;
private HttpConnector2 httpConnector = new HttpConnector2();
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx";
@BeforeAll
public static void beforeAll() throws IOException {
@ -36,9 +36,11 @@ public class EXCELParserTest {
EXCELParser excelParser = new EXCELParser();
List<Object> pl = excelParser
.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic");
.parse(
httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic",
"Topics");
Assertions.assertEquals(3837, pl.size());
Assertions.assertEquals(3878, pl.size());
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.actionmanager.ror;
import java.io.FileInputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.schema.oaf.Organization;
@Disabled
class GenerateRorActionSetJobTest {
private static final ObjectMapper mapper = new ObjectMapper();
private static final String local_file_path = "/Users/michele/Downloads/ror-data-2021-04-06.json";
@BeforeEach
void setUp() throws Exception {
}
@Test
void testConvertRorOrg() throws Exception {
final RorOrganization r = mapper
.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
final Organization org = GenerateRorActionSetJob.convertRorOrg(r);
System.out.println(mapper.writeValueAsString(org));
}
@Test
void testConvertAllRorOrg() throws Exception {
final RorOrganization[] arr = mapper
.readValue(IOUtils.toString(new FileInputStream(local_file_path)), RorOrganization[].class);
for (final RorOrganization r : arr) {
GenerateRorActionSetJob.convertRorOrg(r);
}
}
}

View File

@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)

View File

@ -36,8 +36,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.dhp.transformation.TransformSparkJobNode;

View File

@ -25,22 +25,22 @@ public class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private String resumptionType = "count";
private String resumptionParam = "from";
private String entityXpath = "//hits/hits";
private String resumptionXpath = "//hits";
private String resultTotalXpath = "//hits/total";
private String resultFormatParam = "format";
private String resultFormatValue = "json";
private String resultSizeParam = "size";
private String resultSizeValue = "10";
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private final String resumptionType = "count";
private final String resumptionParam = "from";
private final String entityXpath = "//hits/hits";
private final String resumptionXpath = "//hits";
private final String resultTotalXpath = "//hits/total";
private final String resultFormatParam = "format";
private final String resultFormatValue = "json";
private final String resultSizeParam = "size";
private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
// private String query = "=(sources:engrXiv AND type:preprint)";
private String protocolDescriptor = "rest_json2xml";
private ApiDescriptor api = new ApiDescriptor();
private final String protocolDescriptor = "rest_json2xml";
private final ApiDescriptor api = new ApiDescriptor();
private RestCollectorPlugin rcp;
@BeforeEach

View File

@ -20,20 +20,20 @@ public class RestIteratorTest {
private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class);
private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private String resumptionType = "count";
private String resumptionParam = "from";
private String resumptionXpath = "";
private String resultTotalXpath = "//hits/total";
private String entityXpath = "//hits/hits";
private String resultFormatParam = "format";
private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
private String resultSizeParam = "size";
private String resultSizeValue = "10";
private String authMethod = "";
private String authToken = "";
private String resultOffsetParam = "cursor";
private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private final String resumptionType = "count";
private final String resumptionParam = "from";
private final String resumptionXpath = "";
private final String resultTotalXpath = "//hits/total";
private final String entityXpath = "//hits/hits";
private final String resultFormatParam = "format";
private final String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
private final String resultSizeParam = "size";
private final String resultSizeValue = "10";
private final String authMethod = "";
private final String authToken = "";
private final String resultOffsetParam = "cursor";
private final String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
@Disabled
@Test

View File

@ -27,6 +27,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -50,11 +51,11 @@ public class TransformationJobTest extends AbstractVocabularyTest {
@DisplayName("Test Date cleaner")
public void testDateCleaner() throws Exception {
DateCleaner dc = new DateCleaner();
assertEquals(dc.clean("20/09/1982"), "1982-09-20");
assertEquals(dc.clean("20-09-2002"), "2002-09-20");
assertEquals(dc.clean("2002-09-20"), "2002-09-20");
assertEquals(dc.clean("2002-9"), "2002-09-01");
assertEquals(dc.clean("2021"), "2021-01-01");
assertEquals("1982-09-20", dc.clean("20/09/1982"));
assertEquals("2002-09-20", dc.clean("20-09-2002"));
assertEquals("2002-09-20", dc.clean("2002-09-20"));
assertEquals("2002-09-01", dc.clean("2002-9"));
assertEquals("2021-01-01", dc.clean("2021"));
}
@Test
@ -63,6 +64,8 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// Set the input record, reading the XML from the classpath
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml")));
// Load the XSLT transformation rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
@ -80,6 +83,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// Set the input record, reading the XML from the classpath
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
// Load the XSLT transformation rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
@ -101,6 +105,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// Set the input record, reading the XML from the classpath
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
// Load the XSLT transformation rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
@ -121,6 +126,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// Set the input record, reading the XML from the classpath
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml")));
// Load the XSLT transformation rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
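Two changes run through this test: the assertEquals calls are flipped into JUnit's (expected, actual) argument order, so failure messages label the values correctly, and every record now receives a Provenance before its body is set. The date assertions also imply that DateCleaner pads incomplete dates down to the first month/day; a standalone java.time sketch of that padding behaviour (illustrative, not the DateCleaner implementation):

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class PartialDates {

	// Pads "yyyy" and "yyyy-M" inputs to a full ISO date, e.g.
	// "2002-9" -> "2002-09-01" and "2021" -> "2021-01-01".
	public static String normalize(final String raw) {
		final String[] p = raw.split("-");
		final int year = Integer.parseInt(p[0]);
		final int month = p.length > 1 ? Integer.parseInt(p[1]) : 1;
		final int day = p.length > 2 ? Integer.parseInt(p[2]) : 1;
		return LocalDate.of(year, month, day).format(DateTimeFormatter.ISO_LOCAL_DATE);
	}
}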

View File

@ -0,0 +1,123 @@
{
"ip_addresses": [],
"aliases": [],
"acronyms": [
"ANU"
],
"links": [
"http://www.anu.edu.au/"
],
"country": {
"country_code": "AU",
"country_name": "Australia"
},
"name": "Australian National University",
"wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
"addresses": [
{
"lat": -35.2778,
"state_code": "AU-ACT",
"country_geonames_id": 2077456,
"lng": 149.1205,
"state": "Australian Capital Territory",
"city": "Canberra",
"geonames_city": {
"nuts_level2": {
"name": null,
"code": null
},
"geonames_admin2": {
"ascii_name": null,
"id": null,
"name": null,
"code": null
},
"geonames_admin1": {
"ascii_name": "ACT",
"id": 2177478,
"name": "ACT",
"code": "AU.01"
},
"city": "Canberra",
"id": 2172517,
"nuts_level1": {
"name": null,
"code": null
},
"nuts_level3": {
"name": null,
"code": null
},
"license": {
"attribution": "Data from geonames.org under a CC-BY 3.0 license",
"license": "http://creativecommons.org/licenses/by/3.0/"
}
},
"postcode": null,
"primary": false,
"line": null
}
],
"types": [
"Education"
],
"established": 1946,
"relationships": [
{
"type": "Related",
"id": "https://ror.org/041c7s516",
"label": "Calvary Hospital"
},
{
"type": "Related",
"id": "https://ror.org/04h7nbn38",
"label": "Canberra Hospital"
},
{
"type": "Related",
"id": "https://ror.org/030jpqj15",
"label": "Goulburn Base Hospital"
},
{
"type": "Child",
"id": "https://ror.org/006a4jj40",
"label": "Mount Stromlo Observatory"
}
],
"email_address": null,
"external_ids": {
"Wikidata": {
"all": [
"Q127990"
],
"preferred": null
},
"OrgRef": {
"all": [
"285106"
],
"preferred": null
},
"ISNI": {
"all": [
"0000 0001 2180 7477"
],
"preferred": null
},
"FundRef": {
"all": [
"501100000995",
"501100001151",
"100009020"
],
"preferred": "501100000995"
},
"GRID": {
"all": "grid.1001.0",
"preferred": "grid.1001.0"
}
},
"id": "https://ror.org/019wvm592",
"labels": [],
"status": "active"
}
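This is the ror_org.json fixture read by GenerateRorActionSetJobTest above. A small Jackson tree-model sketch (a hypothetical helper, not project code, assuming the file sits next to the class on the classpath) that picks out a few of the fields shown:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class RorPeek {

	public static void main(final String[] args) throws Exception {
		final JsonNode root = new ObjectMapper()
			.readTree(RorPeek.class.getResourceAsStream("ror_org.json"));
		System.out.println(root.path("name").asText()); // Australian National University
		System.out.println(root.path("country").path("country_code").asText()); // AU
		System.out.println(root.path("external_ids").path("GRID").path("preferred").asText()); // grid.1001.0
	}
}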

View File

@ -0,0 +1,143 @@
// from PROD 2021-07-06, transformation script of HAL with around 3 million records
declare_script "dc_cleaning_OpenAIREplus_compliant_hal";
declare_ns oaf = "http://namespace.openaire.eu/oaf";
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
declare_ns dc = "http://purl.org/dc/elements/1.1/";
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
declare_ns oai = "http://www.openarchives.org/OAI/2.0/";
declare_ns xs = "http://www.w3.org/2001/XMLSchema";
$var0 = "''";
$varFP7 = "'corda_______::'";
$varH2020 = "'corda__h2020::'";
$varDummy = "''";
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
static $varRepoid = xpath:"//dri:repositoryId";
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
dri:objIdentifier = xpath:"//dri:objIdentifier";
dri:repositoryId = $varRepoid;
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
//
// communities - deactivated until received green light from DARIAH to mark community on prod also
// $varCommunity = xpath:"//*[local-name()='setSpec'][starts-with(., 'collection:DARIAH')]/'dariah'";
// concept should not appear with an empty id attribute, i.e. when there is no community - ugly, but seems to work (oaf:datasourceprefix = just any field available in all records)
// oaf:concept = set(xpath:"//oaf:datasourceprefix[string-length($varCommunity) gt 0]/''", @id = $varCommunity;);
//
// apply xpath:"//dc:contributor[starts-with(., 'European Project')]" if xpath:"string-length(replace(., '.*(\d{6,6}).*', '$1')) = 6" oaf:projectid = xpath:"concat($var1, replace(., '.*(\d{6,6}).*', '$1'))"; else $varDummy = "''";
apply xpath:"//dc:creator" if xpath:"string-length(.) &gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
if xpath:"//dc:title[string-length(.)&gt; 0]" $varDummy = "''"; else dc:coverage = skipRecord();
dc:title = xpath:"//dc:title[string-length(.) &gt; 0]/normalize-space(.)";
apply xpath:"//dc:subject" if xpath:"string-length(.) &gt; 0" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:publisher" if xpath:"string-length(.) &gt; 0" dc:publisher = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:source" if xpath:"string-length(.) &gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:contributor = xpath:"//dc:contributor";
// dc:description = xpath:"//dc:description/normalize-space(.)";
//dc:description = xpath:"string-join(//dc:description/normalize-space(.), concat('; ',codepoints-to-string(10)))";
dc:description = xpath:"string-join(//dc:description/normalize-space(.), '; ')";
dc:format = xpath:"//dc:format";
$varHttpTest = "''";
oaf:fulltext = xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))]";
//if xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))] or //dc:relation[starts-with(lower-case(normalize-space(.)), 'info:eu-repo/grantagreement')] or //dc:rights[starts-with(lower-case(normalize-space(.)), 'open') or contains(lower-case(normalize-space(.)), 'openaccess')] or //dc:accessRights[contains(lower-case(normalize-space(.)), 'openaccess')]" $var0 = "''"; else dc:coverage = skipRecord();
if xpath:"//dc:identifier[starts-with(., 'http')]" $var0 = "''"; else dc:coverage = skipRecord();
apply xpath:"//dc:identifier" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
static dr:dateOfTransformation = xpath:"current-dateTime()";
dc:type = xpath:"//dc:type";
dc:format = xpath:"//dc:format";
dc:date = xpath:"//dc:date";
dc:language = Convert(xpath:"//dc:language", Languages);
$varDateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
if xpath:"starts-with($varDateAccepted, '0')" oaf:dateAccepted = $varDummy; else oaf:dateAccepted = $varDateAccepted;
$varEmbargoEnd = xpath:"//dc:date[matches(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', 'i')][contains(lower-case(.), 'info:eu-repo')]/replace(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', '$3', 'i')";
oaf:embargoenddate = $varEmbargoEnd;
// FP7
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2006][contains(lower-case(.), 'info:eu-repo')]/concat($varFP7, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
// H2020
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
// H2020 workaround for HAL
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', 'i')][//dc:contributor[contains(lower-case(.), 'h2020')]][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
dc:relation = xpath:"//dc:relation";
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
//
oaf:collectedDatasourceid = xpath:"$varDatasourceid";
//
//if xpath:"//dc:type[1]/lower-case(.) = 'text'" dr:CobjCategory = Convert(xpath:"reverse(//dc:type) | //oai:setSpec", TextTypologies); else dr:CobjCategory = Convert(xpath:"//dc:type | //oai:setSpec", TextTypologies);
$varCobjCategoryReverse = Convert(xpath:"insert-before(reverse(//dc:type) , 0, reverse(//oai:setSpec))", TextTypologies);
$varSuperTypeReverse = Convert(xpath:"normalize-space($varCobjCategoryReverse)", SuperTypes);
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other') or //oaf:datasourceprefix/lower-case(.) = 'openedition_']/$varCobjCategoryReverse", @type = $varSuperTypeReverse;);
$varCobjCategoryStraight = Convert(xpath:"insert-before(//dc:type , 100, //oai:setSpec)", TextTypologies);
$varSuperTypeStraight = Convert(xpath:"normalize-space($varCobjCategoryStraight)", SuperTypes);
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[not(//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other'))]/$varCobjCategoryStraight", @type = $varSuperTypeStraight;);
//
// review level
// oaf:refereed = Convert(xpath:"//dc:description", ReviewLevels);
$varRefereedConvt = Convert(xpath:"(//dc:type, //oai:setSpec, //dc:description)", ReviewLevels);
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*(this\s*book|it)\s*constitutes\s*the\s*(thoroughly\s*)?refereed') or matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*')]/'0001', //dc:description[matches(., '^version\s*(préliminaire.*|0$)')]/'0002')";
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])pre[\.\-_/\s\(\)]?prints?([\.\-_/\s\(\)].*)?$')][count(//dc:identifier) = 1]/'0002', //*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])refereed([\.\-_/\s\(\)\d].*)?$')]/'0001', //*[string(node-name(.)) = 'dc:identifier' and contains(lower-case(.), '-peer-reviewed-article-')]/'0001')";
$varRefereed = xpath:"($varRefereedConvt, $varRefereedIdntf, $varRefereedDesct)";
if xpath:"count(index-of($varRefereed, '0001')) &gt;0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0002')) &gt;0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
//
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') and (xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) gt current-date())" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
// apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') " oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
// 2021-06-01; acz; next line to avoid OPEN as the default, set to UNKNOWN; 2021-07-05 acz
//if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics') and not(xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) lt current-date())]" $var0 = "''"; else oaf:accessrights = "UNKNOWN";
oaf:license = xpath:"//dc:rights[starts-with(., 'http') or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')]";
//
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
//
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"info:\") or starts-with(., \"urn:\") or starts-with(., \"doi:\") or starts-with(., \"DOI:\") or starts-with(., \"Doi:\") or starts-with(., \"doi \") or starts-with(., \"DOI \") or starts-with(., \"Doi \") or starts-with(., \"10.\") or ((starts-with(., \"http\")) and contains(., \"doi.org/10.\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/10.\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/http\")) and contains(., \"doi.org/10.\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdHdl = identifierExtract('["//dc:identifier[starts-with(., \"HDL:\") and not(starts-with(., \"HDL: http\"))][not(contains(., \"123456789\"))]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/hdl/\") or (starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/url/\") and contains(., \"://hdl.handle.net/\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(info:hdl:|://hdl.handle.net/|info:eu-repo/semantics/altIdentifier/hdl/))(\d.*)');
$varIdIsbn = xpath:"(//dc:identifier, //dc:source)[starts-with(lower-case(.), 'isbn') or starts-with(., '978') or starts-with(., '979')][(matches(., '(isbn[:\s]*)?97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(concat('97', substring-after(., '97'))) = 17) or matches(., '(isbn[:\s]*)?97[89]\d{10}$', 'i')]/replace(., 'isbn[:\s]*', '', 'i'), //dc:relation[starts-with(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')][(matches(., 'info:eu-repo/semantics/altIdentifier/isbn/97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(.) = 59) or matches(., 'info:eu-repo/semantics/altidentifier/isbn/97[89]\d{10}$', 'i')]/substring-after(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')";
$varIdBibc = identifierExtract('["//dc:identifier[starts-with(., \"BibCode:\") or starts-with(., \"BIBCODE:\") or (starts-with(., \"http:\") and contains(., \"bibcode=\"))]"]' , xpath:"./*[local-name()='record']" , '(^(BibCode:|BIBCODE:|http).*$)');
$varIdPtnt = identifierExtract('["//dc:identifier[starts-with(., \"Patent N°:\")]"]' , xpath:"./*[local-name()='record']" , '(^Patent N°:.*$)');
$varPmId = identifierExtract('["//dc:identifier[starts-with(normalize-space(.), \"PUBMED:\")]"]' , xpath:"./*[local-name()='record']" , '(?!PUBMED: )(\d+)');
$varIdPmc = identifierExtract('["//dc:identifier[starts-with(., \"PUBMEDCENTRAL:\") or (starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/PMC\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/http\")) and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(PMC\d+)');
//$varIdHal = identifierExtract('["//dc:identifier[starts-with(., \"ads-\") or starts-with(., \"anses-\") or starts-with(., \"artxibo-\") or starts-with(., \"bioemco-\") or starts-with(., \"cea-\") or starts-with(., \"cel-\") or starts-with(., \"cirad-\") or starts-with(., \"edutice-\") or starts-with(., \"emse-\") or starts-with(., \"EMSE-\") or starts-with(., \"ensl-\") or starts-with(., \"hal-\") or starts-with(., \"HAL-\") or starts-with(., \"halsde-\") or starts-with(., \"halshs-\") or starts-with(., \"hprints-\") or starts-with(., \"in2p3-\") or starts-with(., \"ineris-\") or starts-with(., \"inria-\") or starts-with(., \"Inria-\") or starts-with(., \"inserm-\") or starts-with(., \"insu-\") or starts-with(., \"INSU-\") or starts-with(., \"ird-\") or starts-with(., \"irsn-\") or starts-with(., \"jpa-\") or starts-with(., \"lirmm-\") or starts-with(., \"medihal-\") or starts-with(., \"meteo-\") or starts-with(., \"mnhn-\") or starts-with(., \"obspm-\") or starts-with(., \"pastel-\") or starts-with(., \"pasteur-\") or starts-with(., \"Pasteur-\") or starts-with(., \"peer-\") or starts-with(., \"ssa-\") or starts-with(., \"tel-\") or starts-with(., \"ujm-\") or starts-with(., \"ijn_\") or starts-with(., \"sic_\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\") or contains(., \"://medihal.archives-ouvertes.fr/hal\")))]", "//dc:relation[((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\")) and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '((ads|anses|artxibo|bioemco|cea|cel|cirad|edutice|emse|EMSE|ensl|hal|HAL|halsde|halshs|hprints|in2p3|ineris|inria|Inria|inserm|insu|INSU|ird|irsn|jpa|lirmm|medihal|meteo|mnhn|obspm|pastel|pasteur|Pasteur|peer|ssa|tel|ujm)-|(ijn|sic)_).*');
$varIdHal = identifierExtract('["//*[local-name() = \"recordIdentifier\"]"]' , xpath:"./*[local-name()='record']" , '(oai:HAL:.*)');
$varIdArxv = identifierExtract('["//dc:identifier[((starts-with(., \"http\") or starts-with(., \"ArXiv: http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\"))) or starts-with(., \"arXiv:\") or starts-with(., \"ARXIV:\")]", "//dc:relation[(starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/\") and not(contains(., \"/arxiv/http\"))) or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\")))]"]' , xpath:"./*[local-name()='record']" , '(?!(://arxiv.org/abs/|:eu-repo/semantics/altIdentifier/arxiv/))([a-zA-Z].*)');
$varIdWos = identifierExtract('["//dc:identifier[starts-with(., \"WOS:\") or starts-with(., \"wos: WOS:\")]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/wos/\")]"]' , xpath:"./*[local-name()='record']" , '(info.*|WOS:.+|wos: WOS:.+)');
//oaf:identifier = set(xpath:"$varId//value[not[. = '10.1145/nnnnnnn.nnnnnnn']]", @identifierType = "doi";);
oaf:identifier = set(xpath:"$varIdDoi//value[not(. = '10.1145/nnnnnnn.nnnnnnn')]", @identifierType = "doi";);
oaf:identifier = set(xpath:"$varIdHdl//value", @identifierType = "handle";);
oaf:identifier = set(xpath:"$varIdIsbn", @identifierType = "isbn";);
oaf:identifier = set(xpath:"($varIdBibc//value[not(starts-with(., 'http'))]/replace(., 'BIBCODE:\s*', ''), $varIdBibc//value[starts-with(., 'http') and contains(substring-after(., 'bibcode='), codepoints-to-string(38))]/substring-before(substring-after(., 'bibcode='), codepoints-to-string(38)), $varIdBibc//value[starts-with(., 'http') and not(contains(substring-after(., 'bibcode='), codepoints-to-string(38)))]/substring-after(., 'bibcode='))", @identifierType = "bibcode";);
oaf:identifier = set(xpath:"$varIdPtnt//value/normalize-space(substring-after(., 'Patent N°:'))", @identifierType = "patentNumber";);
oaf:identifier = set(xpath:"$varPmId//value", @identifierType = "pmid";);
oaf:identifier = set(xpath:"$varIdPmc//value", @identifierType = "pmcid";);
//oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"distinct-values(($varIdArxv//value/normalize-space(replace(., '(https?://arxiv.org/abs/|https?://arxiv.org/pdf/|info:eu-repo/semantics/altIdentifier/arxiv/|info:eu-repo/semantics/altIdentifier/url/|info:eu-repo/semantics/altIdentifier/urn/|arXiv:|\.pdf)', '', 'i'))))", @identifierType = "arxiv";);
oaf:identifier = set(xpath:"$varIdWos//value/normalize-space(replace(., '(info:eu-repo/semantics/altIdentifier/wos/|WOS:|wos:)', ''))", @identifierType = "wos";);
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and contains(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))]/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "landingPage";);
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and not(ends-with(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', '')))])", @identifierType = "url";);
oaf:identifier = set(xpath:"//dri:recordIdentifier", @identifierType = "oai-original";);
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
// journal data
// avoiding regular expressions, while a) correcting ISSNs with a missing '-' or other characters in place of '-' and b) ignoring anything trailing the ISSN (e.g. print/online/...)
$varISSN = xpath:"//dc:source[starts-with(., 'ISSN:') and string-length(.) &gt; 12]/concat(substring(normalize-space(substring-after(., 'ISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'ISSN:')), 1, 4))))";
//$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) &gt; 13]/normalize-space(substring-after(., 'ISSN:'))";
$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) &gt; 13]/concat(substring(normalize-space(substring-after(., 'EISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'EISSN:')), 1, 4))))";
oaf:journal = set(xpath:"//oaf:datasourceprefix[$varISSN or $varEISSN]/''", @issn = xpath:"$varISSN";, @eissn = xpath:"$varEISSN";);
end
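Once the XML entities are unescaped, the identifierExtract calls above all share the DOI pattern (10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+). A plain java.util.regex sketch of the same extraction (illustrative only):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DoiExtract {

	// The DOI pattern from the identifierExtract calls, unescaped.
	private static final Pattern DOI = Pattern.compile("(10[.][0-9]{4,}[^\\s\"/<>]*/[^\\s\"<>]+)");

	public static void main(final String[] args) {
		final Matcher m = DOI.matcher("info:eu-repo/semantics/altIdentifier/doi/10.1371/journal.pone.0123456");
		if (m.find()) {
			System.out.println(m.group(1)); // 10.1371/journal.pone.0123456
		}
	}
}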

View File

@ -0,0 +1,140 @@
// from PROD 2021-07-06, transformation script of DOAJ with more than 6 million records
declare_script "dc_cleaning_OpenAIREplus_compliant_doaj";
declare_ns oaf = "http://namespace.openaire.eu/oaf";
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
declare_ns dc = "http://purl.org/dc/elements/1.1/";
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
$var0 = "''";
$varFP7 = "'corda_______::'";
$varH2020 = "'corda__h2020::'";
$varDummy = "''";
// $varUnknownRepoId = "'openaire____::55045bd2a65019fd8e6741a755395c8c'";
//
$varUnknownRepoId = "'openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18'";
$varUnknownRepoName = "'Unknown Repository'";
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
static $varRepoid = xpath:"//dri:repositoryId";
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&amp;apos;/db/DRIVER/RepositoryServiceResources&amp;apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&amp;quot;NamespacePrefix&amp;quot;][value=&amp;quot;', //oaf:datasourceprefix, '&amp;quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
dri:objIdentifier = xpath:"//dri:objIdentifier";
dri:repositoryId = $varRepoid;
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
if xpath:"//dc:creator[string-length(normalize-space(.)) &amp;gt; 0][contains(., 'CDATA')][starts-with(normalize-space(.), '(')][starts-with(normalize-space(.), '.')]" dc:creator = skipRecord(); else $varDummy = "''";
//apply xpath:"//dc:creator" if xpath:"string-length(normalize-space(.)) &amp;amp;gt; 0 and not(contains(., 'CDATA')) and not(starts-with(normalize-space(.), '.')) and not(starts-with(normalize-space(.), '('))" dc:creator = Convert(xpath:".", Person); else $varDummy = "''";
if xpath:"count(//dc:creator) = 0" dc:creator = skipRecord(); else $varDummy = "''";
//apply xpath:"//dc:creator" if xpath:"string-length(.) &amp;gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
$varOrcidName = xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0]";
$varOrcidOrcid = xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0]/@id/replace(., 'https?://orcid.org/', '')";
dc:creator = set(xpath:"$varOrcidName", @nameIdentifier = xpath:"subsequence($varOrcidOrcid,position(),1)";, @nameIdentifierScheme=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','ORCID')";, @schemeUri=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','http://orcid.org/')";);
if xpath:"count(//dc:title[string-length(.) &amp;gt; 0]) = 0" dc:title = skipRecord(); else $varDummy = "''";
dc:title = xpath:"//dc:title/normalize-space(replace(., '^(&amp;lt;title language=)(.)*(&amp;gt;)', ''))";
// apply xpath:"//dc:title" if xpath:"string-length(.) &amp;gt; 0" dc:title = xpath:"normalize-space(.)"; else $varDummy = "''";
apply xpath:"//dc:subject" if xpath:"string-length(.) &amp;gt; 0 and not(@xsi:type = 'dcterms:LCSH')" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:subject = set(xpath:"//dc:subject[@xsi:type = 'dcterms:LCSH']/concat('lcsh:', .)", @classid=xpath:"'lcsh'";, @classname=xpath:"'lcsh'";, @schemeid=xpath:"'dnet:subject_classification_typologies'";, @schemename=xpath:"'dnet:subject_classification_typologies'";);
apply xpath:"//dc:publisher" if xpath:"string-length(.) &amp;gt; 0" dc:publisher = xpath:"normalize-space(replace(., '(&amp;lt;br&amp;gt;)', ''))"; else $varDummy = "''";
apply xpath:"//dc:source" if xpath:"string-length(.) &amp;gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
dc:contributor = xpath:"//dc:contributor";
dc:description = xpath:"//dc:description[not(starts-with(., 'URN: urn:nbn:') or starts-with(., 'URN: http'))]";
dc:format = xpath:"//dc:format";
$varHttpTest = "''";
if xpath:"//dc:relation[starts-with(., 'http') or starts-with(., 'www.')]" $varHttpTest = "true"; else dc:identifier = skipRecord();
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'www.')" dc:identifier = xpath:"concat('http://', normalize-space(.))"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
dr:CobjIdentifier = xpath:"distinct-values(//dc:identifier[not(starts-with(normalize-space(.), 'http'))][not(normalize-space(.) = ($varIdList))][not(starts-with(normalize-space(.), 'urn:nbn:') or starts-with(normalize-space(.), 'URN:NBN:'))][not(. = ($varISSN[1], $varISSN[2]))][normalize-space(.) != ''])";
dc:identifier = xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .), $varIdLdpg//value, $varIdDoi//value)[1]";
dc:relation = xpath:"//dc:relation[starts-with(., 'https://doaj.org/toc/')]";
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
static dr:dateOfTransformation = xpath:"current-dateTime()";
// dc:type = xpath:"//dc:type";
dc:language = Convert(xpath:"//dc:language", Languages);
//if xpath:"//dc:rights[text()='info:eu-repo/semantics/openAccess']" dc:publisher = xpath:"//dc:publisher"; else dc:publisher = skipRecord();
dc:date = xpath:"//dc:date";
oaf:dateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
apply xpath:"//dc:date" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/date')" oaf:embargoenddate = RegExpr(xpath:"normalize-space(.)", $var0, "s/^(.*info:eu-repo\/date\/embargoEnd\/)//gmi"); else $var0 = "''";
//apply xpath:"//dc:relation" if xpath:"string-length(substring-after(normalize-space(.), 'info:eu-repo/grantAgreement/EC/FP7/')) = 6" oaf:projectid = RegExpr(xpath:"normalize-space(.)", $var1, "s/^(.*info:eu-repo\/grantAgreement\/EC\/FP7\/)//gmi"); else dc:relation = xpath:"normalize-space(.)";
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
//
oaf:collectedDatasourceid = $varDatasourceid;
//
// apply xpath:"//dc:type" if xpath:"." dr:CobjCategory = Convert(xpath:"normalize-space(.)", TextTypologies); else dc:type = xpath:".";
//dr:CobjCategory = "0001";
$varCobjCategory = Convert(xpath:"//dc:type", TextTypologies);
$varSuperType = Convert(xpath:"normalize-space($varCobjCategory)", SuperTypes);
dr:CobjCategory = set($varCobjCategory, @type = $varSuperType;);
dc:type = xpath:"//dc:type";
//
// review status
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]/'0001', //*[string(node-name(.)) = 'dc:relation' and matches(., '^info:eu-repo/semantics/altIdentifier/doi/10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$', 'i')]/'0001')";
$varRefereedProse = xpath:"(//*[string(node-name(.)) = 'dc:description' and matches(lower-case(.), '.*this\s*preprint\s*has\s*been\s*reviewed\s*and\s*recommended\s*by\s*peer\s*community') and contains(., '10.24072/')]/'0001', //dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001')";
$varRefereedReltn = xpath:"(//dc:relation, //dc:identifier)[contains(., '://www.dovepress.com/') and matches(lower-case(.), '.*-peer-reviewed-(fulltext-)?article-.*')]/'0001'";
$varRefereedTitle = xpath:"//dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001'";
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or matches(lower-case(.), '(this|a)\s*(article|preprint)\s*(has\s*been\s*)?(peer[\-\s]*)?reviewed\s*and\s*recommended\s*by\s*peer[\-\s]*community')]/'0001')";
$varRefereed = xpath:"($varRefereedIdntf, $varRefereedProse, $varRefereedReltn, $varRefereedTitle, $varRefereedDesct)";
//if xpath:"$varRefereed" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0001')) &gt;0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
if xpath:"count(index-of($varRefereed, '0002')) &gt;0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
//
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics')]" $var0 = "''"; else oaf:accessrights = "OPEN";
//if xpath:"count(//dc:rights) = 0" oaf:accessrights = "OPEN"; else $var0 = "''";
// oaf:accessrights = Convert(xpath:"normalize-space(//dc:rights)", AccessRights);
oaf:license = xpath:"(//dc:rights, //dc:relation)[starts-with(normalize-space(.), 'http') and (contains(., '/licenses/') or contains(., '/licence/') or contains(., '/licencias/') or contains(., '/licencia/') or contains(., '://creativecommons.org/') or contains(., '://rightsstatements.org/')) or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')][not(contains(normalize-space(.), ' '))]/normalize-space(.)";
//
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
//
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]", "//dc:relation[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
$varIdHdl = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://hdl.handle.net/\")][not(contains(., \"123456789\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(://hdl.handle.net/))(\d.*)');
$varIdUrn = identifierExtract('["//dc:relation[starts-with(., \"urn:nbn:\") or starts-with(., \"URN:NBN:\") or (starts-with(., \"http\") and (contains(., \"://nbn-resolving.org/urn:nbn:\") or contains(., \"://nbn-resolving.de/urn/resolver.pl?urn:nbn:\") or contains(., \"://nbn-resolving.de/urn:nbn:\") or contains(., \"://resolver.obvsg.at/urn:nbn:\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve?urn=urn:nbn:\")))]", "//dc:description[contains(., \"URN: urn:nbn:de:0114-\") or contains(., \"URN: http://nbn-resolving.de/urn:nbn:de:0114-\") or (contains(., \"URN:NBN:no-\") and //dc:identifier = \"1893-1774\")]"]' , xpath:"./*[local-name()='record']" , '((urn:nbn:|URN:NBN:).*)');
$varIdArk = identifierExtract('["//dc:relation[starts-with(normalize-space(.), \"http\") and contains(., \"/ark:\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdPmid = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdPmc = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\") or contains(., \"//europepmc.org/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
$varIdHal = identifierExtract('["//dc:relation[starts-with(., \"hal-\") or starts-with(., \"halshs-\") or starts-with(., \"halsde-\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '(hal(shs|sde)?-.*)');
$varIdArxv = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))]"]' , xpath:"./*[local-name()='record']" , '(\d.*)');
$varIdLdpg = identifierExtract('["//dc:identifier[starts-with(., \"https://doaj.org/article/\")]"]', xpath:"./*[local-name()='record']" , '(http.*)');
$varIdUrl = identifierExtract('["//dc:relation[starts-with(., \"http\")][not(contains(., \"://doaj.org\"))][not(contains(., \"doi.org/\"))][not(contains(., \"hdl.handle.net/\"))][not(contains(., \"://nbn-resolving.de/\") or contains(., \"://nbn-resolving.org/\") or contains(., \"://resolver.obvsg.at/\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve\"))][not(contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))][not(contains(., \"://localhost/\") or contains(., \"://localhost:\"))]", "//dc:relation[starts-with(., \"www\")]"]', xpath:"./*[local-name()='record']" , '((http|www).*)');
$varIdList = xpath:"(($varIdDoi//value, $varIdHdl//value, $varIdUrn//value, $varIdArk//value, $varIdPmid//value, $varIdPmc//value, $varIdLdpg//value, $varIdUrl//value))";
// dropping/cleaning wrong DOIs, such as:
// two DOIs that differ only by a trailing '.' (mostly, but not exclusively, prefixed with 10.5216)
// noise stemming from odd/wrong formats of DOI statements
// DOIs with two prefixes
// DOI statements containing first the DOI prefix and then the full DOI including the resolver prefix
//oaf:identifier = set(xpath:"$varId//value", @identifierType = "doi";);
//oaf:identifier = set(xpath:"$varIdDoi//value", @identifierType = "doi";);
oaf:identifier = set(xpath:"distinct-values(($varIdDoi//value[not(ends-with(., '.') and exists(index-of($varIdDoi//value, substring(., 1, string-length(.)-1))))][not(. = '10.4313/article-4')][not(lower-case(.) = ('10.30659/ijibe.2.1.171-181', '10.30659/ijibe.2.1.171', '10.26843/rencima.v8i4.149', '10.26843/rencima.v11i1.215', '10.18273/revfue.v14n2-2016002revista', '10.17061/phrp3112015', '10.21789/24222704', '10.22432/pjsr.2017.14.', '10.22432/pjsr.2017.18.02', '10.22432/pjsr.2017.18.'))][not(starts-with(., '10.1530/VAB-'))][not(starts-with(lower-case(.), '10.1155/s168761720'))][not(starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/'))][not(starts-with(., '10.7454/jvi.v') and string-length(.) = 16)][not(starts-with(., '10.15094/0000') and string-length(.) = 16)][not(matches(., '^10\.\d*/DOI:$'))][not(starts-with(., concat(substring-before(., '/'), '/', substring-before(., '/'), '/')))][not(matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/')))][not(starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.(eor|20050521)$'))][not(substring-before(., '/') = ('10.19183', '10.18066') and matches(., '^(10\.19183/how\.\d*\.\d*|10\.18066/revunivap\.v\d*i\d*)$'))]/lower-case(.), $varIdDoi//value[matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/'))]/substring-after(., 'doi.org/'), $varIdDoi//value[starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.eor$')]/substring(., 1, 28), $varIdDoi//value[starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/')]/substring-after(., '/')))", @identifierType = "doi";);
oaf:identifier = set(xpath:"distinct-values($varIdHdl//value/normalize-space(replace(., '\?locatt=view:master', '')))", @identifierType = "handle";);
oaf:identifier = set(xpath:"$varIdUrn//value", @identifierType = "urn";);
oaf:identifier = set(xpath:"distinct-values($varIdArk//value/replace(substring-after(., '/ark:'), '^/', ''))", @identifierType = "ark";);
oaf:identifier = set(xpath:"distinct-values($varIdPmid//value/replace(., 'https?://www.ncbi.nlm.nih.gov/pmc/articles/pmid/(\d+)(/.*)?', '$1'))", @identifierType = "pmid";);
oaf:identifier = set(xpath:"distinct-values($varIdPmc//value/replace(., 'https?://(www.ncbi.nlm.nih.gov/pmc|europepmc.org)/articles/(PMC\d*)([/\?].*)?', '$2'))", @identifierType = "pmcid";);
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '/document', ''))", @identifierType = "hal";);
oaf:identifier = set(xpath:"$varIdArxv//value", @identifierType = "arxiv";);
oaf:identifier = set(xpath:"$varIdLdpg//value", @identifierType = "landingPage";);
oaf:identifier = set(xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .))", @identifierType = "url";);
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
//$varJournalName = xpath:"substring-before(//dc:source, ',')";
$varJournalTitle = xpath:"(//dc:source[contains(., ', Vol ')]/substring-before(., ', Vol '), //dc:source[contains(., ', Iss ')]/substring-before(., ', Iss '))[1]";
$varVol = xpath:"//dc:source[contains(., ', Vol ')][matches(., ', Vol \d+')]/replace(substring-after(., ', Vol '), '^(\d+).*$', '$1')";
$varIss = xpath:"//dc:source[contains(., ', Iss ')][matches(., ', Iss \d+')]/replace(substring-after(., ', Iss '), '^(\d+).*$', '$1')";
$varSp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/substring-before(substring-after(., ', Pp '), '-')";
$varEp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/replace(substring-after(substring-after(., ', Pp '), '-'), '^(\d+).*$', '$1')";
$varISSN = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][1]"; , @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][2]";);
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9]";);
oaf:journal = set($varJournalTitle, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";, @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][2]";, @vol = xpath:"$varVol";, @iss = xpath:"$varIss";, @sp = xpath:"$varSp";, @ep = xpath:"$varEp";);
end
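The journal block at the end of this script derives title, volume, issue and page range from dc:source strings shaped like "Some Journal, Vol 8, Iss 4, Pp 149-160". A simplified single-regex Java sketch of that parsing (the XPaths above handle each part independently and more defensively):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SourceFieldParse {

	private static final Pattern P = Pattern.compile("^(.*?), Vol (\\d+), Iss (\\d+), Pp (\\d+)-(\\d+).*$");

	public static void main(final String[] args) {
		final Matcher m = P.matcher("Some Journal, Vol 8, Iss 4, Pp 149-160");
		if (m.matches()) {
			System.out.printf("title=%s vol=%s iss=%s sp=%s ep=%s%n",
				m.group(1), m.group(2), m.group(3), m.group(4), m.group(5));
		}
	}
}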

View File

@ -0,0 +1,492 @@
<!-- from PROD 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
extension-element-prefixes="transformExt TransformationFunction"
exclude-result-prefixes="transformExt TransformationFunction" >
<xsl:output indent="yes" omit-xml-declaration="yes"/>
<!--
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
-->
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<xsl:param name="transDate" select="current-dateTime()"/>
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<!-- in journal.fi xml:lang of translated titles is not within the trans-title element but within the surrounding trans-title-group element (which just contains 1 trans-title element) -->
<!--
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
-->
<xsl:call-template name="title">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
</xsl:element>
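<!-- zero padding sketch for the dateAccepted expression below (worked example, invented values):
for month '5': concat('0','5','1') = '051', string-length('5') idiv 2 + 1 = 1, substring('051', 1, 2) = '05'
for month '12': concat('0','12','1') = '0121', string-length('12') idiv 2 + 1 = 2, substring('0121', 2, 2) = '12'
an empty month yields '01', so the trailing '1' doubles as a default -->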
<xsl:element name="oaf:dateAccepted">
<!--
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />
<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
-->
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 + 1, 2))" />
</xsl:element>
<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
<xsl:choose>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2] and ./*[local-name()='day' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4]">
<dc:date>
<xsl:value-of select="./*[local-name()='year']"/>
</dc:date>
</xsl:when>
</xsl:choose>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
<xsl:with-param name="targetElement" select="'oaf:license'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
<oaf:identifier>
<xsl:attribute name="identifierType">
<xsl:text>landingPage</xsl:text>
</xsl:attribute>
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
<oaf:fulltext>
<xsl:value-of select="."/>
</oaf:fulltext>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<!-- -->
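<!-- access rights sketch for the variable below: free_to_read yields 'open' when it has neither a
future start_date nor a past end_date, and 'embargo' when the start_date lies in the future or the
end_date in the past (e.g., with an invented value, start_date='2099-01-01' maps to embargo);
the max( (string(.), '0001-01-01') ) wrapper guards against empty attributes -->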
<xsl:variable name='varRights' select="distinct-values((for $i in (
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
return TransformationFunction:convertString($tf, normalize-space($i), 'AccessRights')))" />
<!--
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
-->
<oaf:accessrights>
<xsl:choose>
<xsl:when test="$varRights[. = 'EMBARGO']">
<xsl:value-of select="'EMBARGO'"/>
</xsl:when>
<xsl:when test="$varRights[. != 'UNKNOWN']">
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$varRights[1]"/>
</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<!--
<oaf:accessrights>
<xsl:value-of select="$varRights[1]"/>
</oaf:accessrights>
<xsl:element name="oaf:accessrights">
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
</xsl:element>
-->
<!--
<xsl:element name="dr:CobjCategory">
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
<xsl:attribute name="type" select="$varSuperType"/>
<xsl:value-of select="$varCobjCategory" />
</xsl:element>
<xsl:variable name='varCobjCatLst' select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
-->
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')))" />
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
return concat($i, '###', TransformationFunction:convertString($tf, normalize-space($i), 'SuperTypes'))" />
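<!-- each entry of $varCobjSupLst pairs a type code with its supertype as 'code###supertype'
(e.g., an invented pair '0001###publication'); the cascade below prefers the first pair whose
supertype is not 'other' and whose code is none of 0038/0039/0040, then relaxes step by step
down to the fallback 0000/other -->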
<dr:CobjCategory>
<xsl:choose>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other')]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other')][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="type" select="'other'"/>
<xsl:value-of select="'0000'" />
</xsl:otherwise>
</xsl:choose>
</dr:CobjCategory>
<!--
<xsl:for-each select="$varCobjSupLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<xsl:for-each select="$varTypLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
<!--
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<oaf:language>
<xsl:value-of select="TransformationFunction:convertString($tf, //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'Languages')" />
</oaf:language>
<!-- review status -->
<!-- ToDo:
review status
~ ask Journal.fi to put it elsewhere
~ evaluate article-version (no example found yet)
subject/kwd:
~ handle thesauri (no example found yet)
relations:
~ handle fn (no example found yet)
-->
<!--
<xsl:variable name="varRefereedConvt" select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
-->
<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
<!--
<oaf:refereed>
<xsl:value-of select="$varRefereedDescp"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="$varRefereed"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
</oaf:refereed>
-->
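<!-- the choose below prefers '0001' (used here for peer reviewed) over '0002' (non peer reviewed)
when both review levels were detected -->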
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:if test="(.[@xml:lang] or ..[@xml:lang]) and $targetElement = ('dc:title', 'dc:description', 'dc:subject')">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="title">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:title">
<xsl:if test=".[@xml:lang] or ..[@xml:lang]">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="string-join((., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')])/normalize-space(.), ': ')"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)"/>
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)"/>
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)"/>
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)"/>
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)"/>
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)"/>
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>
<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:if test="string-length($sourceElement[@pub-id-type='doi']) gt 0">
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
</xsl:element>
</xsl:if>
</xsl:template>
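<!-- the authors template below derives the ORCID nameIdentifier by stripping the literal prefix
'http://orcid.org/'; with substring-after, an identifier recorded under another scheme (e.g. https)
would come out empty -->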
<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,437 @@
<!-- from production 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
extension-element-prefixes="transformExt TransformationFunction"
exclude-result-prefixes="transformExt TransformationFunction" >
<xsl:output indent="yes" omit-xml-declaration="yes"/>
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<xsl:param name="transDate" select="current-dateTime()"/>
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'contrib'][@contrib-type='author']"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))][./*[local-name()='name'] or ./*[local-name()='name-alternatives']/*[local-name()='name']][string-length(.//*[local-name()='surname']) + string-length(.//*[local-name()='given-names']) > 0]"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()='abstract']"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group' and not(lower-case(@kwd-group-type)=('mesh', 'ocis'))]//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
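<!-- MeSH keyword sketch (invented value) for the loop below: a kwd 'mesh Neoplasms' becomes
'mesh:Neoplasms'; the replace() only strips a leading 'mesh ' label, values without it pass
through unchanged -->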
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='mesh' and ./*[local-name()='kwd']]">
<xsl:for-each select="./*[local-name()='kwd']">
<dc:subject>
<xsl:attribute name="subjectScheme" select="'mesh'"/>
<xsl:attribute name="schemeURI" select="'http://www.nlm.nih.gov/mesh/'"/>
<xsl:attribute name="valueURI" select="''"/>
<xsl:value-of select="./concat('mesh:', replace(., 'mesh (.*)$', '$1'))"/>
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='ocis' and ./*[local-name()='kwd']]">
<xsl:for-each select="./*[local-name()='kwd']">
<dc:subject>
<xsl:attribute name="subjectScheme" select="'ocis'"/>
<xsl:attribute name="schemeURI" select="''"/>
<xsl:attribute name="valueURI" select="''"/>
<xsl:value-of select="./concat('ocis:', .)"/>
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/(*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version'], *[local-name() = 'article-version'])/concat('article-version (', @article-version-type, ') ', .)"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:text>eng</xsl:text>
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
<xsl:element name="oaf:fulltext">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
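<!-- dateAccepted sketch: $month is built with format-number(), so when the electronic pub-date
carries no month it formats as 'NaN' and the value degrades to YYYY-01-01, otherwise YYYY-MM-01
(the day is always pinned to '01'); records with only a print pub-date fall back to its year -->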
<xsl:element name="oaf:dateAccepted">
<xsl:choose>
<xsl:when test="//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub'] or //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']" >
<xsl:if test="string(number($month)) eq 'NaN'" >
<xsl:value-of select="concat($year, '-', '01', '-', '01')" />
</xsl:if>
<xsl:if test="string(number($month)) != 'NaN'" >
<xsl:value-of select="concat($year, '-', $month, '-', '01')" />
</xsl:if>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat(//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='ppub']/*[local-name()='year'], '-01-01')" />
</xsl:otherwise>
</xsl:choose>
</xsl:element>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='copyright-statement'])"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='license'])"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:element name="oaf:accessrights">
<xsl:text>OPEN</xsl:text>
</xsl:element>
<xsl:element name="dr:CobjCategory">
<xsl:attribute name="type" select="'publication'"/>
<xsl:text>0001</xsl:text>
</xsl:element>
<dc:type>
<xsl:value-of select="//*[local-name() = 'article']/@article-type"/>
</dc:type>
<!-- custom-meta perhaps not used for types, then drop
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varTypLst' select="//*[local-name() = 'article']/@article-type"/>
-->
<!-- perhaps ensure that file indeed exists, e.g. as pdf etc -->
<!--
// reduce load for the big PubMed records by exchanging variables with choose
<xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedFnote" select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
matches(lower-case(.), '.*refereed\s*anonymously.*') or
matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*')
]/'0001'"/>
<xsl:variable name="varRefereedReviw" select="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
matches(lower-case(.), '.*peer\s*review\s*file.*')]/'0001'"/>
<xsl:variable name="varRefereedReltn" select="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]/'0002'"/>
<xsl:variable name="varRefereedCtRol" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
[./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]/'0001'"/>
<xsl:variable name="varRefereedVersn" select="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]/'0002'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedFnote, $varRefereedReviw, $varRefereedReltn, $varRefereedCtRol, $varRefereedVersn)"/>
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
-->
<xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
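<!-- refereed cascade below, in order: vocabulary match on article-type/setSpec ('0001'),
F1000-style versioned DOIs, 'refereed article' markers in abstracts, peer review footnotes,
peer review files in supplementary material, reviewer contributor roles; only then the '0002'
signals: vocabulary match, reviewed related-article links, preprint article-version -->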
<xsl:choose>
<xsl:when test="count($varRefereedConvt[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'article-id'][@pub-id-type='doi'][matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
matches(lower-case(.), '.*refereed\s*anonymously.*') or
matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*') or
matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\].*') or
matches(lower-case(.), '.*\[.*referees\s*:.*\].*') or
matches(lower-case(.), '^\s*plagiarism[\s\-\._]check.*') or
matches(lower-case(.), '^\s*peer[\s\-\._]*review.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
matches(lower-case(.), '.*peer\s*review\s*file.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
[./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereedConvt[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varHostedByName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varHostedById"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
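<!-- country sketch (invented value) for the loop below: a footnote 'Country of origin: Finland'
is lower-cased, substring-after 'of origin' leaves ': finland', substring(., 2) drops the colon,
and normalize-space() trims it to 'finland' before the Countries vocabulary lookup -->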
<xsl:for-each select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = 'fn-group']/*[local-name() = 'fn'][matches(lower-case(.), 'country(/territory)? of origin:?\s*[A-Za-z\-]+')]">
<oaf:country>
<!--
<xsl:value-of select="TransformationFunction:convertString($tf, replace(lower-case(.), '^(.|\s)*country(/territory)? of origin:?\s+([A-Za-z\-,\(\)]+(\s+[A-Za-z\-,\(\)]+)*)(.|\s)*$', '$3'), 'Countries')"/>
-->
<xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), 'of origin'), 2)), 'Countries')"/>
</oaf:country>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)"/>
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)"/>
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)"/>
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)"/>
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)"/>
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)"/>
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>
<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmc</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='pmcid']"/>
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmid</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='pmid']"/>
</xsl:element>
</xsl:template>
<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
</xsl:attribute>
</xsl:if>
<!--
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
-->
<xsl:value-of select="concat(normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='surname']), ', ', normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,493 @@
<!-- from PROD 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xlink="http://www.w3.org/1999/xlink"
exclude-result-prefixes="xsl vocabulary dateCleaner"
version="2.0">
<!--
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
-->
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<xsl:param name="transDate" select="current-dateTime()"/>
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<!-- in journal.fi xml:lang of translated titles is not within the trans-title element but within the surrounding trans-title-group element (which just contains 1 trans-title element) -->
<!--
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
-->
<xsl:call-template name="title">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<!--
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />
<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
-->
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 + 1, 2))" />
</xsl:element>
<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
<xsl:choose>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2] and ./*[local-name()='day' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4]">
<dc:date>
<xsl:value-of select="./*[local-name()='year']"/>
</dc:date>
</xsl:when>
</xsl:choose>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
<xsl:with-param name="targetElement" select="'oaf:license'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
<oaf:identifier>
<xsl:attribute name="identifierType">
<xsl:text>landingPage</xsl:text>
</xsl:attribute>
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
<oaf:fulltext>
<xsl:value-of select="."/>
</oaf:fulltext>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<!-- -->
<xsl:variable name='varRights' select="distinct-values((for $i in (
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
return vocabulary:clean( normalize-space($i), 'dnet:access_modes') "
/>
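<!-- same rights derivation as in the TransformationFunction variant above, here cleaned through
the dnet:access_modes vocabulary; presumably 'open' and 'embargo' normalise to the OPEN and
EMBARGO codes the choose below tests for -->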
<!--
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
-->
<oaf:accessrights>
<xsl:choose>
<xsl:when test="$varRights[. = 'EMBARGO']">
<xsl:value-of select="'EMBARGO'"/>
</xsl:when>
<xsl:when test="$varRights[. != 'UNKNOWN']">
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$varRights[1]"/>
</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<!--
<oaf:accessrights>
<xsl:value-of select="$varRights[1]"/>
</oaf:accessrights>
<xsl:element name="oaf:accessrights">
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
</xsl:element>
-->
<!--
<xsl:element name="dr:CobjCategory">
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
<xsl:attribute name="type" select="$varSuperType"/>
<xsl:value-of select="$varCobjCategory" />
</xsl:element>
<xsl:variable name='varCobjCatLst' select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
-->
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
return vocabulary:clean( normalize-space($i), 'dnet:publication_resource')))" />
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
return concat($i, '###', vocabulary:clean( normalize-space($i), 'dnet:result_typologies'))" />
<dr:CobjCategory>
<xsl:choose>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other')]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other')][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="type" select="'other'"/>
<xsl:value-of select="'0000'" />
</xsl:otherwise>
</xsl:choose>
</dr:CobjCategory>
<!--
<xsl:for-each select="$varCobjSupLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<xsl:for-each select="$varTypLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
<!--
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->
<oaf:language>
<xsl:value-of select="vocabulary:clean( //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'dnet:languages')" />
</oaf:language>
<!-- review status -->
<!-- ToDo:
review status
~ ask Journal.fi to put it elsewhere
~ evaluate article-version (no example found yet)
subject/kwd:
~ handle thesauri (no example found yet)
relations:
~ handle fn (no example found yet)
-->
<!--
<xsl:variable name="varRefereedConvt" select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
-->
<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
<!--
<oaf:refereed>
<xsl:value-of select="$varRefereedDescp"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="$varRefereed"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
</oaf:refereed>
-->
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:if test="(.[@xml:lang] or ..[@xml:lang]) and $targetElement = ('dc:title', 'dc:description', 'dc:subject')">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="title">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:title">
<xsl:if test=".[@xml:lang] or ..[@xml:lang]">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="string-join((., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')])/normalize-space(.), ': ')"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)"/>
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)"/>
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)"/>
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)"/>
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)"/>
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)"/>
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>
<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:if test="string-length($sourceElement[@pub-id-type='doi']) gt 0">
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
</xsl:element>
</xsl:if>
</xsl:template>
<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
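A note on the extension functions: the stylesheet above calls vocabulary:clean(value, vocabularyId) through the http://eu/dnetlib/transform/clean namespace, i.e. a host-language extension function rather than a standard XPath function. The sketch below shows how such a function can be bound when the transformation runs under Saxon's s9api; the class name and the lookup body are hypothetical, not the dnet-hadoop implementation.

import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;

public class VocabularyCleanFunction implements ExtensionFunction {

	@Override
	public QName getName() {
		// matches xmlns:vocabulary="http://eu/dnetlib/transform/clean"
		return new QName("http://eu/dnetlib/transform/clean", "clean");
	}

	@Override
	public SequenceType getResultType() {
		return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE);
	}

	@Override
	public SequenceType[] getArgumentTypes() {
		return new SequenceType[] {
			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE),
			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
		};
	}

	@Override
	public XdmValue call(final XdmValue[] arguments) throws SaxonApiException {
		final String value = arguments[0].size() == 0 ? "" : arguments[0].itemAt(0).getStringValue();
		final String vocabularyId = arguments[1].itemAt(0).getStringValue();
		// hypothetical lookup: resolve the raw value to a term of the named vocabulary
		return new XdmAtomicValue(lookup(vocabularyId, value));
	}

	private String lookup(final String vocabularyId, final String value) {
		return value; // placeholder: a real implementation consults the vocabulary
	}
}

// Registration happens once on the Saxon Processor used to compile the
// stylesheet: processor.registerExtensionFunction(new VocabularyCleanFunction());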

View File

@ -0,0 +1,373 @@
<!-- for adaptation, 2021-06-14 PROD -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
exclude-result-prefixes="xsl vocabulary dateCleaner"
version="2.0">
<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:output indent="yes" omit-xml-declaration="yes" />
<xsl:param name="varHostedById" select="&apos;opendoar____::908&apos;" />
<xsl:param name="varHostedByName" select="&apos;Europe PubMed Central&apos;" />
<xsl:param name="varFP7FundRefDOI" select="&apos;10.13039/501100004963&apos;" />
<xsl:param name="varH2020FundRefDOI" select="&apos;10.13039/501100007601&apos;" />
<xsl:param name="varFP7" select="&apos;corda_______::&apos;" />
<xsl:param name="varH2020" select="&apos;corda__h2020::&apos;" />
<xsl:param name="epmcUrlPrefix" select="&apos;http://europepmc.org/articles/&apos;" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = &apos;header&apos;]/*[local-name()=&apos;recordIdentifier&apos;], &apos;:&apos;)" />
<xsl:param name="index" select="0" />
<xsl:param name="transDate" select="current-dateTime()" />
<xsl:variable name="year" select="format-number( ( //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;year&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;year&apos;]), &apos;0000&apos;)" />
<xsl:variable name="month" select="format-number( (//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;month&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;month&apos;]), &apos;00&apos;)" />
<xsl:variable name="day" select="format-number( (//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;]/*[local-name()=&apos;day&apos;] | //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]/*[local-name()=&apos;day&apos;]), &apos;00&apos;)" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = &apos;header&apos;]" />
<metadata>
<xsl:if test="not(//*[local-name() = &apos;article-meta&apos;]//*[local-name()=&apos;article-title&apos;][string-length(normalize-space(.))&gt; 0])">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = &apos;article-meta&apos;]//*[local-name()=&apos;article-title&apos;][string-length(normalize-space(.))&gt; 0]" />
<xsl:with-param name="targetElement" select="&apos;dc:title&apos;" />
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'contrib'][@contrib-type='author']"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = &apos;contrib-group&apos;]/*[local-name() = &apos;contrib&apos;][@contrib-type=&apos;author&apos;][not(exists(child::*:collab))][./*[local-name()=&apos;name&apos;] or ./*[local-name()=&apos;name-alternatives&apos;]/*[local-name()=&apos;name&apos;]][string-length(.//*[local-name()=&apos;surname&apos;]) + string-length(.//*[local-name()=&apos;given-names&apos;]) &gt; 0]" />
</xsl:call-template> <!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;abstract&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:description&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-categories&apos;]//*[local-name()=&apos;subject&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:subject&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;kwd-group&apos; and not(lower-case(@kwd-group-type)=(&apos;mesh&apos;, &apos;ocis&apos;))]//*[local-name()=&apos;kwd&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:subject&apos;" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()=&apos;kwd-group&apos; and lower-case(@kwd-group-type)=&apos;mesh&apos; and ./*[local-name()=&apos;kwd&apos;]]">
<xsl:for-each select="./*[local-name()=&apos;kwd&apos;]">
<dc:subject>
<xsl:attribute name="subjectScheme" select="&apos;mesh&apos;" />
<xsl:attribute name="schemeURI" select="&apos;http://www.nlm.nih.gov/mesh/&apos;" />
<xsl:attribute name="valueURI" select="&apos;&apos;" />
<xsl:value-of select="./concat(&apos;mesh:&apos;, replace(., &apos;mesh (.*)$&apos;, &apos;$1&apos;))" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:for-each select="//*[local-name()=&apos;kwd-group&apos; and lower-case(@kwd-group-type)=&apos;ocis&apos; and ./*[local-name()=&apos;kwd&apos;]]">
<xsl:for-each select="./*[local-name()=&apos;kwd&apos;]">
<dc:subject>
<xsl:attribute name="subjectScheme" select="&apos;ocis&apos;" />
<xsl:attribute name="schemeURI" select="&apos;&apos;" />
<xsl:attribute name="valueURI" select="&apos;&apos;" />
<xsl:value-of select="./concat(&apos;ocis:&apos;, .)" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;publisher&apos;]/*[local-name()=&apos;publisher-name&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:publisher&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;journal-meta&apos;]//*[local-name()=&apos;journal-title&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:source&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = &apos;article-meta&apos;]/(*[local-name() = &apos;article-version-alternatives&apos;]/*[local-name() = &apos;article-version&apos;], *[local-name() = &apos;article-version&apos;])/concat(&apos;article-version (&apos;, @article-version-type, &apos;) &apos;, .)" />
<xsl:with-param name="targetElement" select="&apos;dc:source&apos;" />
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:text>eng</xsl:text>
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()=&apos;article-id&apos;][@pub-id-type=&apos;pmcid&apos;])" />
</xsl:element>
<xsl:element name="oaf:fulltext">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()=&apos;article-id&apos;][@pub-id-type=&apos;pmcid&apos;])" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<xsl:choose>
<xsl:when test="//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;epub&apos;] or //*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@date-type=&apos;pub&apos; and @publication-format=&apos;electronic&apos;]">
<xsl:if test="string(number($month)) eq &apos;NaN&apos;">
<xsl:value-of select="concat($year, &apos;-&apos;, &apos;01&apos;, &apos;-&apos;, &apos;01&apos;)" />
</xsl:if>
<xsl:if test="string(number($month)) != &apos;NaN&apos;">
<xsl:value-of select="concat($year, &apos;-&apos;, $month, &apos;-&apos;, &apos;01&apos;)" />
</xsl:if>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat(//*[local-name()=&apos;article-meta&apos;]//*[local-name()=&apos;pub-date&apos;][@pub-type=&apos;ppub&apos;]/*[local-name()=&apos;year&apos;], &apos;-01-01&apos;)" />
</xsl:otherwise>
</xsl:choose>
</xsl:element>
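<!-- The choose above builds oaf:dateAccepted: with an electronic pub-date, it emits the year plus the zero-padded month (or 01 when the month is missing or non-numeric) and day 01; otherwise it falls back to the print (ppub) year with -01-01. -->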
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()=&apos;permissions&apos;]/*[local-name()=&apos;copyright-statement&apos;])" />
<xsl:with-param name="targetElement" select="&apos;dc:rights&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()=&apos;permissions&apos;]/*[local-name()=&apos;license&apos;])" />
<xsl:with-param name="targetElement" select="&apos;dc:rights&apos;" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;fn-group&apos;]//*[local-name()=&apos;fn&apos;]" />
<xsl:with-param name="targetElement" select="&apos;dc:relation&apos;" />
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()=&apos;article-id&apos;]" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()=&apos;award-group&apos;][.//*[local-name()=&apos;institution-id&apos;][ends-with(., $varFP7FundRefDOI)]]">
<xsl:if test="./*[local-name()=&apos;award-id&apos;][matches(normalize-space(.), &apos;(^\d\d\d\d\d\d$)&apos;, &apos;i&apos;)]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()=&apos;award-id&apos;])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()=&apos;award-group&apos;][.//*[local-name()=&apos;institution-id&apos;][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()=&apos;award-id&apos;][matches(normalize-space(.), &apos;(^\d\d\d\d\d\d$)&apos;, &apos;i&apos;)]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()=&apos;award-id&apos;])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:element name="oaf:accessrights">
<xsl:text>OPEN</xsl:text>
</xsl:element>
<xsl:element name="dr:CobjCategory">
<xsl:attribute name="type" select="&apos;publication&apos;" />
<xsl:text>0001</xsl:text>
</xsl:element>
<dc:type>
<xsl:value-of select="//*[local-name() = &apos;article&apos;]/@article-type" />
</dc:type>
<xsl:variable name="varRefereedConvt" select="for $i in (//*[local-name() = 'resource']/*[local-name() = ('resourceType', 'version')]/(., @uri))
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
<!-- <xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = &apos;article&apos;]/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), &apos;ReviewLevels&apos;)" />
-->
<xsl:choose>
<xsl:when test="count($varRefereedConvt[. = &apos;0001&apos;]) &gt; 0">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = &apos;article-id&apos;][@pub-id-type=&apos;doi&apos;][matches(., &apos;^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = (&apos;abstract&apos;, &apos;trans-abstract&apos;)][matches(lower-case(.), &apos;^\s*(.p.\s*)?refereed\s*article.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article&apos;]/*[local-name() = (&apos;back&apos;, &apos;front&apos;)]/*[local-name() = (&apos;fn-group&apos;, &apos;notes&apos;)][
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*&apos;) or
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]*review\s*information.*&apos;) or
matches(lower-case(.), &apos;.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*&apos;) or
matches(lower-case(.), &apos;.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*&apos;) or
matches(lower-case(.), &apos;.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*&apos;) or
matches(lower-case(.), &apos;.*peer[\.\-_/\s\(\)]*reviewed\s*by.*&apos;) or
matches(lower-case(.), &apos;.*refereed\s*anonymously.*&apos;) or
matches(lower-case(.), &apos;.*peer\s*reviewer\s*reports\s*are\s*available.*&apos;) or
matches(lower-case(.), &apos;.*\[.*peer[\s\-\._]*review\s*:.*\].*&apos;) or
matches(lower-case(.), &apos;.*\[.*referees\s*:.*\].*&apos;) or
matches(lower-case(.), &apos;^\s*plagiarism[\s\-\._]check.*&apos;) or
matches(lower-case(.), &apos;^\s*peer[\s\-\._]*review.*&apos;) or
matches(lower-case(.), &apos;^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*&apos;) or
matches(lower-case(.), &apos;^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = (&apos;article-meta&apos;, &apos;app&apos;, &apos;app-group&apos;)]/*[local-name() = &apos;supplementary-material&apos;]/*[local-name() = &apos;media&apos;][
matches(lower-case(.), &apos;.*peer\s*review\s*file.*&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;]/*[local-name() = &apos;contrib-group&apos;]
[./@role/lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;) or
./*[local-name() = &apos;contrib&apos;][./@role/lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;) or ./*[local-name() = &apos;role&apos; and lower-case(.) = (&apos;reviewer&apos;, &apos;solicited external reviewer&apos;)] or ./@contrib-type/lower-case(.) = &apos;reviewer&apos;]]">
<oaf:refereed>
<xsl:value-of select="&apos;0001&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereedConvt[. = &apos;0002&apos;]) &gt; 0">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = (&apos;related-article&apos;)][./@related-article-type = (&apos;peer-reviewed-article&apos;, &apos;reviewed-article&apos;)]">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = &apos;article-meta&apos;][./*[local-name() = &apos;article-version-alternatives&apos;]/*[local-name() = &apos;article-version&apos; and . = &apos;preprint&apos;] or ./*[local-name() = &apos;article-version&apos; and . = &apos;preprint&apos;]]">
<oaf:refereed>
<xsl:value-of select="&apos;0002&apos;" />
</oaf:refereed>
</xsl:when>
</xsl:choose>
<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()=&apos;journal-meta&apos;]//*[local-name()=&apos;journal-title&apos;]" />
<xsl:with-param name="issn" select="//*[local-name()=&apos;journal-meta&apos;]/*[local-name()=&apos;issn&apos;][@pub-type=&apos;ppub&apos;]" />
<xsl:with-param name="eissn" select="//*[local-name()=&apos;journal-meta&apos;]/*[local-name()=&apos;issn&apos;][@pub-type=&apos;epub&apos;]" />
<xsl:with-param name="vol" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;volume&apos;]" />
<xsl:with-param name="issue" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;issue&apos;]" />
<xsl:with-param name="sp" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;fpage&apos;]" />
<xsl:with-param name="ep" select="//*[local-name()=&apos;article-meta&apos;]/*[local-name()=&apos;lpage&apos;]" />
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varHostedByName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varHostedById" />
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId" />
</xsl:attribute>
</oaf:collectedFrom>
<xsl:for-each select="//*[local-name() = &apos;article&apos;]/*[local-name() = (&apos;back&apos;, &apos;front&apos;)]/*[local-name() = &apos;fn-group&apos;]/*[local-name() = &apos;fn&apos;][matches(lower-case(.), &apos;country(/territory)? of origin:?\s*[A-Za-z\-]+&apos;)]">
<oaf:country>
<!--
<xsl:value-of select="TransformationFunction:convertString($tf, replace(lower-case(.), '^(.|\s)*country(/territory)? of origin:?\s+([A-Za-z\-,\(\)]+(\s+[A-Za-z\-,\(\)]+)*)(.|\s)*$', '$3'), 'Countries')"/>
-->
<!-- ACz, 2021-06-14
<xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), &apos;of origin&apos;), 2)), &apos;Countries&apos;)" />
-->
<xsl:value-of select="vocabulary:clean( normalize-space(substring(substring-after(lower-case(.), &apos;of origin&apos;), 2)), 'dnet:countries')"/>
</oaf:country>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = &apos;about&apos;]" />
</record>
</xsl:template>
<xsl:template name="allElements">
<xsl:param name="sourceElement" />
<xsl:param name="targetElement" />
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:value-of select="normalize-space(.)" />
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template name="journal">
<xsl:param name="journalTitle" />
<xsl:param name="issn" />
<xsl:param name="eissn" />
<xsl:param name="vol" />
<xsl:param name="issue" />
<xsl:param name="sp" />
<xsl:param name="ep" />
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)" />
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)" />
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)" />
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)" />
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)" />
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)" />
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)" />
</xsl:element>
</xsl:template>
<xsl:template name="identifiers">
<xsl:param name="sourceElement" />
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type=&apos;doi&apos;]" />
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmc</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type=&apos;pmcid&apos;]" />
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmid</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type=&apos;pmid&apos;]" />
</xsl:element>
</xsl:template>
<xsl:template name="authors">
<xsl:param name="sourceElement" />
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()=&apos;contrib-id&apos;][@contrib-id-type=&apos;orcid&apos;]">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()=&apos;contrib-id&apos;][@contrib-id-type=&apos;orcid&apos;], &apos;http://orcid.org/&apos;)" />
</xsl:attribute>
</xsl:if> <!--
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
-->
<xsl:value-of select="concat(normalize-space(./(*[local-name()=&apos;name&apos;], *[local-name()=&apos;name-alternatives&apos;]/*[local-name()=&apos;name&apos;])/*[local-name()=&apos;surname&apos;]), &apos;, &apos;, normalize-space(./(*[local-name()=&apos;name&apos;], *[local-name()=&apos;name-alternatives&apos;]/*[local-name()=&apos;name&apos;])/*[local-name()=&apos;given-names&apos;]))" />
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = &apos;header&apos;]">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate" />
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
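The two award-group loops in the stylesheet above apply the same rule with different funder prefixes. A plain-Java restatement may help in reading the XPath; the class is illustrative only, while the FundRef DOIs, the corda prefixes, and the six-digit pattern are taken verbatim from the stylesheet parameters.

import java.util.regex.Pattern;

public final class ProjectIdMapper {

	private static final String FP7_FUNDREF_DOI = "10.13039/501100004963";
	private static final String H2020_FUNDREF_DOI = "10.13039/501100007601";
	private static final Pattern SIX_DIGITS = Pattern.compile("^\\d{6}$");

	// A six-digit award-id whose award-group carries an institution-id ending
	// with the FP7 or H2020 FundRef DOI becomes an OpenAIRE project id with
	// the corresponding corda prefix (see the oaf:projectid elements above).
	static String toProjectId(final String institutionId, final String awardId) {
		final String grant = awardId == null ? "" : awardId.trim();
		if (institutionId == null || !SIX_DIGITS.matcher(grant).matches()) {
			return null;
		}
		if (institutionId.endsWith(FP7_FUNDREF_DOI)) {
			return "corda_______::" + grant;
		}
		if (institutionId.endsWith(H2020_FUNDREF_DOI)) {
			return "corda__h2020::" + grant;
		}
		return null;
	}

	private ProjectIdMapper() {
	}
}

For example, toProjectId("http://dx.doi.org/10.13039/501100007601", "633152") yields "corda__h2020::633152".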

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.broker.oa;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@ -47,26 +49,22 @@ public class CheckDuplictedIdsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("invalid_event_id");
final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();
final Encoder<Tuple2<String, Long>> encoder = Encoders.tuple(Encoders.STRING(), Encoders.LONG());
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.groupByKey(t -> t._1, Encoders.STRING())
.agg(agg)
.map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.filter(t -> t._2 > 1)
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.map((MapFunction<Event, Tuple2<String, Long>>) e -> new Tuple2<>(e.getEventId(), 1l), encoder)
.groupByKey((MapFunction<Tuple2<String, Long>, String>) t -> t._1, Encoders.STRING())
.agg(new CountAggregator().toColumn())
.map((MapFunction<Tuple2<String, Tuple2<String, Long>>, Tuple2<String, Long>>) t -> t._2, encoder)
.filter((FilterFunction<Tuple2<String, Long>>) t -> t._2 > 1)
.map(
(MapFunction<Tuple2<String, Long>, Tuple2<String, Long>>) o -> ClusterUtils
.incrementAccumulator(o, total),
encoder)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(countPath);
}
private static String eventAsJsonString(final Event f) throws JsonProcessingException {
return new ObjectMapper().writeValueAsString(f);
}
}
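The change above, repeated throughout the hunks that follow, wraps every lambda passed to the Dataset API in an explicit MapFunction/FilterFunction cast. Since Scala 2.12, overloads such as Dataset.map(scala.Function1, Encoder) and Dataset.map(MapFunction, Encoder) are both functional-interface targets, so a bare Java lambda is ambiguous and fails to compile; the cast selects the Java-friendly overload. A self-contained illustration (the class and data are invented for the example):

import java.util.Arrays;

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class AmbiguousLambdaDemo {

	public static void main(final String[] args) {
		final SparkSession spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate();
		final Dataset<String> words = spark.createDataset(Arrays.asList("alpha", "beta"), Encoders.STRING());

		// words.map(s -> s.length(), Encoders.INT()) no longer compiles against
		// Scala 2.12 builds of Spark: both map(Function1, Encoder) and
		// map(MapFunction, Encoder) accept the lambda. The casts disambiguate.
		final Dataset<Integer> lengths = words
			.filter((FilterFunction<String>) s -> !s.isEmpty())
			.map((MapFunction<String, Integer>) String::length, Encoders.INT());

		lengths.show();
		spark.stop();
	}
}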

View File

@ -12,6 +12,8 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
@ -77,11 +79,11 @@ public class GenerateEventsJob {
final Dataset<Event> dataset = groups
.map(
g -> EventFinder
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
Encoders
.bean(EventGroup.class))
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
.flatMap((FlatMapFunction<EventGroup, Event>) g -> g.getData().iterator(), Encoders.bean(Event.class));
ClusterUtils.save(dataset, eventsPath, Event.class, total);

View File

@ -13,6 +13,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
@ -24,6 +25,7 @@ import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator;
import scala.Tuple2;
public class GenerateStatsJob {
@ -71,9 +73,14 @@ public class GenerateStatsJob {
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.groupByKey(e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(), Encoders.STRING())
.groupByKey(
(MapFunction<Event, String>) e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(DatasourceStats.class))
.map(
(MapFunction<Tuple2<String, DatasourceStats>, DatasourceStats>) t -> t._2,
Encoders.bean(DatasourceStats.class))
.coalesce(1)
.write()
.mode(SaveMode.Overwrite)
.jdbc(dbUrl, "oa_datasource_stats_temp", connectionProperties);

View File

@ -13,6 +13,8 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
@ -30,6 +32,7 @@ import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.subset.EventSubsetAggregator;
import scala.Tuple2;
public class IndexEventSubsetJob {
@ -83,13 +86,15 @@ public class IndexEventSubsetJob {
final Dataset<Event> subset = ClusterUtils
.readPath(spark, eventsPath, Event.class)
.groupByKey(e -> e.getTopic() + '@' + e.getMap().getTargetDatasourceId(), Encoders.STRING())
.groupByKey(
(MapFunction<Event, String>) e -> e.getTopic() + '@' + e.getMap().getTargetDatasourceId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(EventGroup.class))
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
.map((MapFunction<Tuple2<String, EventGroup>, EventGroup>) t -> t._2, Encoders.bean(EventGroup.class))
.flatMap((FlatMapFunction<EventGroup, Event>) g -> g.getData().iterator(), Encoders.bean(Event.class));
final JavaRDD<String> inputRdd = subset
.map(e -> prepareEventForIndexing(e, now, total), Encoders.STRING())
.map((MapFunction<Event, String>) e -> prepareEventForIndexing(e, now, total), Encoders.STRING())
.javaRDD();
final Map<String, String> esCfg = new HashMap<>();

View File

@ -18,7 +18,10 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
@ -89,13 +92,17 @@ public class IndexNotificationsJob {
log.info("Number of subscriptions: " + subscriptions.size());
if (subscriptions.size() > 0) {
final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
final Dataset<Notification> notifications = ClusterUtils
.readPath(spark, eventsPath, Event.class)
.map(e -> generateNotifications(e, subscriptions, startTime), Encoders.bean(NotificationGroup.class))
.flatMap(g -> g.getData().iterator(), Encoders.bean(Notification.class));
.map(
(MapFunction<Event, NotificationGroup>) e -> generateNotifications(e, subscriptions, startTime),
ngEncoder)
.flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);
final JavaRDD<String> inputRdd = notifications
.map(n -> prepareForIndexing(n, total), Encoders.STRING())
.map((MapFunction<Notification, String>) n -> prepareForIndexing(n, total), Encoders.STRING())
.javaRDD();
final Map<String, String> esCfg = new HashMap<>();
@ -192,15 +199,11 @@ public class IndexNotificationsJob {
return false;
}
if (conditions.containsKey("targetSubjects")
&& !conditions
return !conditions.containsKey("targetSubjects")
|| conditions
.get("targetSubjects")
.stream()
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()))) {
return false;
}
return true;
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));
}
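A note on reading the hunk above: its removal and addition lines are interleaved. Reconstructed, the change collapses a guard clause into a single equivalent boolean return (!A || B in place of "if (A && !B) return false; return true;"):

// Before: guard clause
if (conditions.containsKey("targetSubjects")
	&& !conditions
		.get("targetSubjects")
		.stream()
		.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()))) {
	return false;
}
return true;

// After: one equivalent expression
return !conditions.containsKey("targetSubjects")
	|| conditions
		.get("targetSubjects")
		.stream()
		.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));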

View File

@ -7,6 +7,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.TypedColumn;
@ -67,9 +68,13 @@ public class JoinStep0Job {
final Dataset<OaBrokerMainEntity> dataset = sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedDatasource>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
.map(
(MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
Encoders.bean(OaBrokerMainEntity.class));
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);

View File

@ -69,7 +69,9 @@ public class JoinStep1Job {
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedProject>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
.map(
(MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
Encoders.bean(OaBrokerMainEntity.class));
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);

View File

@ -7,6 +7,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.TypedColumn;
@ -64,9 +65,13 @@ public class JoinStep2Job {
final Dataset<OaBrokerMainEntity> dataset = sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedSoftware>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
.map(
(MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
Encoders.bean(OaBrokerMainEntity.class));
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);

View File

@ -69,7 +69,9 @@ public class JoinStep3Job {
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedDataset>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
.map(
(MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
Encoders.bean(OaBrokerMainEntity.class));
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);

View File

@ -69,7 +69,9 @@ public class JoinStep4Job {
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedPublication>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
.map(
(MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
Encoders.bean(OaBrokerMainEntity.class));
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);

View File

@ -7,6 +7,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@ -64,7 +65,7 @@ public class PrepareGroupsJob {
final Dataset<Relation> mergedRels = ClusterUtils
.loadRelations(graphPath, spark)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
.filter((FilterFunction<Relation>) r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
.toColumn();
@ -75,8 +76,9 @@ public class PrepareGroupsJob {
(MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(ResultGroup.class))
.filter(rg -> rg.getData().size() > 1);
.map(
(MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
.filter((FilterFunction<ResultGroup>) rg -> rg.getData().size() > 1);
ClusterUtils.save(dataset, groupsPath, ResultGroup.class, total);

View File

@ -7,6 +7,8 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
@ -20,6 +22,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareRelatedDatasetsJob {
@ -58,20 +61,22 @@ public class PrepareRelatedDatasetsJob {
final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
.filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
.filter((FilterFunction<eu.dnetlib.dhp.schema.oaf.Dataset>) d -> !ClusterUtils.isDedupRoot(d.getId()))
.map(
(MapFunction<eu.dnetlib.dhp.schema.oaf.Dataset, OaBrokerRelatedDataset>) ConversionUtils::oafDatasetToBrokerDataset,
Encoders.bean(OaBrokerRelatedDataset.class));
final Dataset<Relation> rels = ClusterUtils
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter((FilterFunction<Relation>) r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
final Dataset<RelatedDataset> dataset = rels
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
.map((MapFunction<Tuple2<Relation, OaBrokerRelatedDataset>, RelatedDataset>) t -> {
final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
t._2);
rel.getRelDataset().setRelType(t._1.getRelClass());

View File

@ -7,6 +7,9 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
@ -25,6 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
import scala.Tuple3;
public class PrepareRelatedDatasourcesJob {
@ -70,17 +74,20 @@ public class PrepareRelatedDatasourcesJob {
final Dataset<OaBrokerRelatedDatasource> datasources = ClusterUtils
.readPath(spark, graphPath + "/datasource", Datasource.class)
.map(ConversionUtils::oafDatasourceToBrokerDatasource, Encoders.bean(OaBrokerRelatedDatasource.class));
.map(
(MapFunction<Datasource, OaBrokerRelatedDatasource>) ConversionUtils::oafDatasourceToBrokerDatasource,
Encoders.bean(OaBrokerRelatedDatasource.class));
final Dataset<RelatedDatasource> dataset = rels
.joinWith(datasources, datasources.col("openaireId").equalTo(rels.col("_2")), "inner")
.map(t -> {
final RelatedDatasource r = new RelatedDatasource();
r.setSource(t._1._1());
r.setRelDatasource(t._2);
r.getRelDatasource().setRelType(t._1._3());
return r;
}, Encoders.bean(RelatedDatasource.class));
.map(
(MapFunction<Tuple2<Tuple3<String, String, String>, OaBrokerRelatedDatasource>, RelatedDatasource>) t -> {
final RelatedDatasource r = new RelatedDatasource();
r.setSource(t._1._1());
r.setRelDatasource(t._2);
r.getRelDatasource().setRelType(t._1._3());
return r;
}, Encoders.bean(RelatedDatasource.class));
ClusterUtils.save(dataset, relsPath, RelatedDatasource.class, total);
@ -88,19 +95,22 @@ public class PrepareRelatedDatasourcesJob {
}
private static final Dataset<Tuple3<String, String, String>> prepareResultTuples(final SparkSession spark,
private static final <T extends Result> Dataset<Tuple3<String, String, String>> prepareResultTuples(
final SparkSession spark,
final String graphPath,
final Class<? extends Result> sourceClass) {
final Class<T> sourceClass) {
return ClusterUtils
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter((FilterFunction<T>) r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter((FilterFunction<T>) r -> r.getDataInfo().getDeletedbyinference())
.map(
r -> DatasourceRelationsAccumulator.calculateTuples(r),
(MapFunction<T, DatasourceRelationsAccumulator>) r -> DatasourceRelationsAccumulator.calculateTuples(r),
Encoders.bean(DatasourceRelationsAccumulator.class))
.flatMap(
acc -> acc.getRels().iterator(),
(FlatMapFunction<DatasourceRelationsAccumulator, Tuple3<String, String, String>>) acc -> acc
.getRels()
.iterator(),
Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()));
}
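The signature change above serves the same cast pattern: with a wildcard Class<? extends Result> there is no name for the element type, so the SAM casts cannot be written; the type parameter T gives them something to bind to. A minimal demo of the idea, with Number standing in for the dnet Result type:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

public final class GenericCastDemo {

	// A generic method lets (MapFunction<T, String>) reference the element
	// type, which a wildcard-typed parameter cannot provide.
	static <T extends Number> Dataset<String> asStrings(final Dataset<T> ds) {
		return ds.map((MapFunction<T, String>) n -> n.toString(), Encoders.STRING());
	}

	private GenericCastDemo() {
	}
}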

View File

@ -7,6 +7,8 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
@ -22,6 +24,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareRelatedProjectsJob {
@ -60,20 +63,25 @@ public class PrepareRelatedProjectsJob {
final Dataset<OaBrokerProject> projects = ClusterUtils
.readPath(spark, graphPath + "/project", Project.class)
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
.filter((FilterFunction<Project>) p -> !ClusterUtils.isDedupRoot(p.getId()))
.map(
(MapFunction<Project, OaBrokerProject>) ConversionUtils::oafProjectToBrokerProject,
Encoders.bean(OaBrokerProject.class));
final Dataset<Relation> rels = ClusterUtils
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
.filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
final Dataset<RelatedProject> dataset = rels
.joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class));
.map(
(MapFunction<Tuple2<Relation, OaBrokerProject>, RelatedProject>) t -> new RelatedProject(
t._1.getSource(), t._2),
Encoders.bean(RelatedProject.class));
ClusterUtils.save(dataset, relsPath, RelatedProject.class, total);

View File

@ -7,6 +7,8 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
@ -21,6 +23,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareRelatedPublicationsJob {
@ -59,22 +62,22 @@ public class PrepareRelatedPublicationsJob {
final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
.readPath(spark, graphPath + "/publication", Publication.class)
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
.filter((FilterFunction<Publication>) p -> !ClusterUtils.isDedupRoot(p.getId()))
.map(
ConversionUtils::oafPublicationToBrokerPublication,
(MapFunction<Publication, OaBrokerRelatedPublication>) ConversionUtils::oafPublicationToBrokerPublication,
Encoders.bean(OaBrokerRelatedPublication.class));
final Dataset<Relation> rels = ClusterUtils
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter((FilterFunction<Relation>) r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
final Dataset<RelatedPublication> dataset = rels
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
.map((MapFunction<Tuple2<Relation, OaBrokerRelatedPublication>, RelatedPublication>) t -> {
final RelatedPublication rel = new RelatedPublication(
t._1.getSource(), t._2);
rel.getRelPublication().setRelType(t._1.getRelClass());

View File

@ -7,7 +7,10 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
@ -22,6 +25,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
public class PrepareRelatedSoftwaresJob {
@ -58,22 +62,30 @@ public class PrepareRelatedSoftwaresJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
final Encoder<OaBrokerRelatedSoftware> obrsEncoder = Encoders.bean(OaBrokerRelatedSoftware.class);
final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
.readPath(spark, graphPath + "/software", Software.class)
.filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
.filter((FilterFunction<Software>) sw -> !ClusterUtils.isDedupRoot(sw.getId()))
.map(
(MapFunction<Software, OaBrokerRelatedSoftware>) ConversionUtils::oafSoftwareToBrokerSoftware,
obrsEncoder);
final Dataset<Relation> rels = ClusterUtils
final Dataset<Relation> rels;
rels = ClusterUtils
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
final Encoder<RelatedSoftware> rsEncoder = Encoders.bean(RelatedSoftware.class);
final Dataset<RelatedSoftware> dataset = rels
.joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class));
.map(
(MapFunction<Tuple2<Relation, OaBrokerRelatedSoftware>, RelatedSoftware>) t -> new RelatedSoftware(
t._1.getSource(), t._2),
rsEncoder);
ClusterUtils.save(dataset, relsPath, RelatedSoftware.class, total);

View File

@ -7,7 +7,10 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
@ -73,11 +76,12 @@ public class PrepareSimpleEntititiesJob {
final String graphPath,
final Class<SRC> sourceClass) {
final Encoder<OaBrokerMainEntity> encoder = Encoders.bean(OaBrokerMainEntity.class);
return ClusterUtils
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
.filter((FilterFunction<SRC>) r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter((FilterFunction<SRC>) r -> r.getDataInfo().getDeletedbyinference())
.map((MapFunction<SRC, OaBrokerMainEntity>) ConversionUtils::oafResultToBrokerResult, encoder);
}
}

Some files were not shown because too many files have changed in this diff.