forked from D-Net/dnet-hadoop
Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_ids
commit 680bfa490f
@@ -21,6 +21,10 @@
 			<groupId>org.apache.hadoop</groupId>
 			<artifactId>hadoop-common</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>com.github.sisyphsu</groupId>
+			<artifactId>dateparser</artifactId>
+		</dependency>
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-core_2.11</artifactId>
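Note for reviewers: the com.github.sisyphsu:dateparser dependency added above backs the date normalization in the new GraphCleaningFunctions further down. A minimal sketch of the library call this commit relies on — the wrapper class and the hard-coded "yyyy-MM-dd" pattern are illustrative assumptions (the real code formats with ModelSupport.DATE_FORMAT):

import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

import com.github.sisyphsu.dateparser.DateParserUtils;

public class DateParserDemo {
	public static void main(String[] args) {
		// parseDate accepts heterogeneous inputs ("2016 Apr 05", "8/1/71", epoch millis, ...)
		// and returns a java.util.Date
		java.util.Date parsed = DateParserUtils.parseDate("2016 Apr 05");
		String normalized = DateTimeFormatter
			.ofPattern("yyyy-MM-dd") // assumption: same pattern as ModelSupport.DATE_FORMAT
			.format(parsed.toInstant().atZone(ZoneId.systemDefault()).toLocalDate());
		System.out.println(normalized); // prints 2016-04-05
	}
}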
@@ -1,182 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-import java.util.UUID;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstores")
-public class MDStore implements Serializable {
-
-	/** */
-	private static final long serialVersionUID = 3160530489149700055L;
-
-	@Id
-	@Column(name = "id")
-	private String id;
-
-	@Column(name = "format")
-	private String format;
-
-	@Column(name = "layout")
-	private String layout;
-
-	@Column(name = "interpretation")
-	private String interpretation;
-
-	@Column(name = "datasource_name")
-	private String datasourceName;
-
-	@Column(name = "datasource_id")
-	private String datasourceId;
-
-	@Column(name = "api_id")
-	private String apiId;
-
-	@Column(name = "hdfs_path")
-	private String hdfsPath;
-
-	@Column(name = "creation_date")
-	@Temporal(TemporalType.TIMESTAMP)
-	private Date creationDate;
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(final String id) {
-		this.id = id;
-	}
-
-	public String getFormat() {
-		return format;
-	}
-
-	public void setFormat(final String format) {
-		this.format = format;
-	}
-
-	public String getLayout() {
-		return layout;
-	}
-
-	public void setLayout(final String layout) {
-		this.layout = layout;
-	}
-
-	public String getInterpretation() {
-		return interpretation;
-	}
-
-	public void setInterpretation(final String interpretation) {
-		this.interpretation = interpretation;
-	}
-
-	public String getDatasourceName() {
-		return datasourceName;
-	}
-
-	public void setDatasourceName(final String datasourceName) {
-		this.datasourceName = datasourceName;
-	}
-
-	public String getDatasourceId() {
-		return datasourceId;
-	}
-
-	public void setDatasourceId(final String datasourceId) {
-		this.datasourceId = datasourceId;
-	}
-
-	public String getApiId() {
-		return apiId;
-	}
-
-	public void setApiId(final String apiId) {
-		this.apiId = apiId;
-	}
-
-	public String getHdfsPath() {
-		return hdfsPath;
-	}
-
-	public void setHdfsPath(final String hdfsPath) {
-		this.hdfsPath = hdfsPath;
-	}
-
-	public Date getCreationDate() {
-		return creationDate;
-	}
-
-	public void setCreationDate(final Date creationDate) {
-		this.creationDate = creationDate;
-	}
-
-	public static MDStore newInstance(
-		final String format,
-		final String layout,
-		final String interpretation,
-		final String hdfsBasePath) {
-		return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
-	}
-
-	public static MDStore newInstance(
-		final String format,
-		final String layout,
-		final String interpretation,
-		final String dsName,
-		final String dsId,
-		final String apiId,
-		final String hdfsBasePath) {
-
-		final String mdId = "md-" + UUID.randomUUID();
-
-		final MDStore md = new MDStore();
-		md.setId(mdId);
-		md.setFormat(format);
-		md.setLayout(layout);
-		md.setInterpretation(interpretation);
-		md.setCreationDate(new Date());
-		md.setDatasourceName(dsName);
-		md.setDatasourceId(dsId);
-		md.setApiId(apiId);
-		md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
-
-		return md;
-	}
-
-	@Override
-	public String toString() {
-		return String
-			.format(
-				"MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
-				id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
-	}
-
-	@Override
-	public int hashCode() {
-		return Objects.hash(id);
-	}
-
-	@Override
-	public boolean equals(final Object obj) {
-		if (this == obj) {
-			return true;
-		}
-		if (!(obj instanceof MDStore)) {
-			return false;
-		}
-		final MDStore other = (MDStore) obj;
-		return Objects.equals(id, other.id);
-	}
-
-}
@@ -1,74 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-
-@Entity
-@Table(name = "mdstore_current_versions")
-public class MDStoreCurrentVersion implements Serializable {
-
-	/** */
-	private static final long serialVersionUID = -4757725888593745773L;
-
-	@Id
-	@Column(name = "mdstore")
-	private String mdstore;
-
-	@Column(name = "current_version")
-	private String currentVersion;
-
-	public String getMdstore() {
-		return mdstore;
-	}
-
-	public void setMdstore(final String mdstore) {
-		this.mdstore = mdstore;
-	}
-
-	public String getCurrentVersion() {
-		return currentVersion;
-	}
-
-	public void setCurrentVersion(final String currentVersion) {
-		this.currentVersion = currentVersion;
-	}
-
-	public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) {
-		final MDStoreCurrentVersion cv = new MDStoreCurrentVersion();
-		cv.setMdstore(mdId);
-		cv.setCurrentVersion(versionId);
-		return cv;
-	}
-
-	public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
-		return newInstance(v.getMdstore(), v.getId());
-	}
-
-	@Override
-	public String toString() {
-		return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion);
-	}
-
-	@Override
-	public int hashCode() {
-		return Objects.hash(currentVersion, mdstore);
-	}
-
-	@Override
-	public boolean equals(final Object obj) {
-		if (this == obj) {
-			return true;
-		}
-		if (!(obj instanceof MDStoreCurrentVersion)) {
-			return false;
-		}
-		final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
-		return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
-	}
-}
@@ -1,140 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstore_versions")
-public class MDStoreVersion implements Serializable {
-
-	/** */
-	private static final long serialVersionUID = -4763494442274298339L;
-
-	@Id
-	@Column(name = "id")
-	private String id;
-
-	@Column(name = "mdstore")
-	private String mdstore;
-
-	@Column(name = "writing")
-	private boolean writing;
-
-	@Column(name = "readcount")
-	private int readCount = 0;
-
-	@Column(name = "lastupdate")
-	@Temporal(TemporalType.TIMESTAMP)
-	private Date lastUpdate;
-
-	@Column(name = "size")
-	private long size = 0;
-
-	@Column(name = "hdfs_path")
-	private String hdfsPath;
-
-	public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
-		final MDStoreVersion v = new MDStoreVersion();
-
-		final String versionId = mdId + "-" + new Date().getTime();
-		v.setId(versionId);
-		v.setMdstore(mdId);
-		v.setLastUpdate(null);
-		v.setWriting(writing);
-		v.setReadCount(0);
-		v.setSize(0);
-		v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId));
-
-		return v;
-	}
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(final String id) {
-		this.id = id;
-	}
-
-	public String getMdstore() {
-		return mdstore;
-	}
-
-	public void setMdstore(final String mdstore) {
-		this.mdstore = mdstore;
-	}
-
-	public boolean isWriting() {
-		return writing;
-	}
-
-	public void setWriting(final boolean writing) {
-		this.writing = writing;
-	}
-
-	public int getReadCount() {
-		return readCount;
-	}
-
-	public void setReadCount(final int readCount) {
-		this.readCount = readCount;
-	}
-
-	public Date getLastUpdate() {
-		return lastUpdate;
-	}
-
-	public void setLastUpdate(final Date lastUpdate) {
-		this.lastUpdate = lastUpdate;
-	}
-
-	public long getSize() {
-		return size;
-	}
-
-	public void setSize(final long size) {
-		this.size = size;
-	}
-
-	public String getHdfsPath() {
-		return hdfsPath;
-	}
-
-	public void setHdfsPath(final String hdfsPath) {
-		this.hdfsPath = hdfsPath;
-	}
-
-	@Override
-	public String toString() {
-		return String
-			.format(
-				"MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
-				mdstore, writing, readCount, lastUpdate, size, hdfsPath);
-	}
-
-	@Override
-	public int hashCode() {
-		return Objects.hash(id);
-	}
-
-	@Override
-	public boolean equals(final Object obj) {
-		if (this == obj) {
-			return true;
-		}
-		if (!(obj instanceof MDStoreVersion)) {
-			return false;
-		}
-		final MDStoreVersion other = (MDStoreVersion) obj;
-		return Objects.equals(id, other.id);
-	}
-}
@@ -1,194 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstores_with_info")
-public class MDStoreWithInfo implements Serializable {
-
-	/** */
-	private static final long serialVersionUID = -8445784770687571492L;
-
-	@Id
-	@Column(name = "id")
-	private String id;
-
-	@Column(name = "format")
-	private String format;
-
-	@Column(name = "layout")
-	private String layout;
-
-	@Column(name = "interpretation")
-	private String interpretation;
-
-	@Column(name = "datasource_name")
-	private String datasourceName;
-
-	@Column(name = "datasource_id")
-	private String datasourceId;
-
-	@Column(name = "api_id")
-	private String apiId;
-
-	@Column(name = "current_version")
-	private String currentVersion;
-
-	@Column(name = "creation_date")
-	@Temporal(TemporalType.TIMESTAMP)
-	private Date creationDate;
-
-	@Column(name = "lastupdate")
-	@Temporal(TemporalType.TIMESTAMP)
-	private Date lastUpdate;
-
-	@Column(name = "size")
-	private long size = 0;
-
-	@Column(name = "n_versions")
-	private long numberOfVersions = 0;
-
-	@Column(name = "hdfs_path")
-	private String hdfsPath;
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(final String id) {
-		this.id = id;
-	}
-
-	public String getFormat() {
-		return format;
-	}
-
-	public void setFormat(final String format) {
-		this.format = format;
-	}
-
-	public String getLayout() {
-		return layout;
-	}
-
-	public void setLayout(final String layout) {
-		this.layout = layout;
-	}
-
-	public String getInterpretation() {
-		return interpretation;
-	}
-
-	public void setInterpretation(final String interpretation) {
-		this.interpretation = interpretation;
-	}
-
-	public String getDatasourceName() {
-		return datasourceName;
-	}
-
-	public void setDatasourceName(final String datasourceName) {
-		this.datasourceName = datasourceName;
-	}
-
-	public String getDatasourceId() {
-		return datasourceId;
-	}
-
-	public void setDatasourceId(final String datasourceId) {
-		this.datasourceId = datasourceId;
-	}
-
-	public String getApiId() {
-		return apiId;
-	}
-
-	public void setApiId(final String apiId) {
-		this.apiId = apiId;
-	}
-
-	public String getCurrentVersion() {
-		return currentVersion;
-	}
-
-	public void setCurrentVersion(final String currentVersion) {
-		this.currentVersion = currentVersion;
-	}
-
-	public Date getCreationDate() {
-		return creationDate;
-	}
-
-	public void setCreationDate(final Date creationDate) {
-		this.creationDate = creationDate;
-	}
-
-	public Date getLastUpdate() {
-		return lastUpdate;
-	}
-
-	public void setLastUpdate(final Date lastUpdate) {
-		this.lastUpdate = lastUpdate;
-	}
-
-	public long getSize() {
-		return size;
-	}
-
-	public void setSize(final long size) {
-		this.size = size;
-	}
-
-	public long getNumberOfVersions() {
-		return numberOfVersions;
-	}
-
-	public void setNumberOfVersions(final long numberOfVersions) {
-		this.numberOfVersions = numberOfVersions;
-	}
-
-	public String getHdfsPath() {
-		return hdfsPath;
-	}
-
-	public void setHdfsPath(final String hdfsPath) {
-		this.hdfsPath = hdfsPath;
-	}
-
-	@Override
-	public String toString() {
-		return String
-			.format(
-				"MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
-				id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
-				lastUpdate, size, numberOfVersions, hdfsPath);
-	}
-
-	@Override
-	public int hashCode() {
-		return Objects.hash(id);
-	}
-
-	@Override
-	public boolean equals(final Object obj) {
-		if (this == obj) {
-			return true;
-		}
-		if (!(obj instanceof MDStoreWithInfo)) {
-			return false;
-		}
-		final MDStoreWithInfo other = (MDStoreWithInfo) obj;
-		return Objects.equals(id, other.id);
-	}
-
-}
@@ -14,7 +14,7 @@ public class DbClient implements Closeable {

 	private static final Log log = LogFactory.getLog(DbClient.class);

-	private Connection connection;
+	private final Connection connection;

 	public DbClient(final String address, final String login, final String password) {
@@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable {
 			BufferedInputStream bis = new BufferedInputStream(is);

 			int count;
-			byte data[] = new byte[1024];
+			byte[] data = new byte[1024];
 			while ((count = bis.read(data, 0, data.length)) != -1) {
 				ar.write(data, 0, count);
 			}
@@ -13,9 +13,9 @@ import okio.Source;

 public class InputStreamRequestBody extends RequestBody {

-	private InputStream inputStream;
-	private MediaType mediaType;
-	private long lenght;
+	private final InputStream inputStream;
+	private final MediaType mediaType;
+	private final long lenght;

 	public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
@@ -21,7 +21,7 @@ public class DNetRestClient {

 	private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);

-	private static ObjectMapper mapper = new ObjectMapper();
+	private static final ObjectMapper mapper = new ObjectMapper();

 	public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
 		final HttpGet httpGet = new HttpGet(url);
@@ -34,7 +34,7 @@ public class MessageSender {

 	private final String workflowId;

-	private ExecutorService executorService = Executors.newCachedThreadPool();
+	private final ExecutorService executorService = Executors.newCachedThreadPool();

 	public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
 		this.workflowId = workflowId;
@@ -0,0 +1,459 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jetbrains.annotations.NotNull;
+
+import com.github.sisyphsu.dateparser.DateParserUtils;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class GraphCleaningFunctions extends CleaningFunctions {
+
+	public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
+	public static final int ORCID_LEN = 19;
+	public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
+	public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
+	public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
+	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
+
+	public static <T extends Oaf> T fixVocabularyNames(T value) {
+		if (value instanceof Datasource) {
+			// nothing to clean here
+		} else if (value instanceof Project) {
+			// nothing to clean here
+		} else if (value instanceof Organization) {
+			Organization o = (Organization) value;
+			if (Objects.nonNull(o.getCountry())) {
+				fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
+			}
+		} else if (value instanceof Relation) {
+			// nothing to clean here
+		} else if (value instanceof Result) {
+
+			Result r = (Result) value;
+
+			fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
+			fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
+			fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
+
+			if (Objects.nonNull(r.getSubject())) {
+				r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
+			}
+			if (Objects.nonNull(r.getInstance())) {
+				for (Instance i : r.getInstance()) {
+					fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
+					fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
+				}
+			}
+			if (Objects.nonNull(r.getAuthor())) {
+				r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
+					if (Objects.nonNull(a.getPid())) {
+						a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
+							fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
+						});
+					}
+				});
+			}
+			if (value instanceof Publication) {
+
+			} else if (value instanceof Dataset) {
+
+			} else if (value instanceof OtherResearchProduct) {
+
+			} else if (value instanceof Software) {
+
+			}
+		}
+
+		return value;
+	}
+
+	public static <T extends Oaf> boolean filter(T value) {
+		if (value instanceof Datasource) {
+			// nothing to evaluate here
+		} else if (value instanceof Project) {
+			// nothing to evaluate here
+		} else if (value instanceof Organization) {
+			// nothing to evaluate here
+		} else if (value instanceof Relation) {
+			// nothing to clean here
+		} else if (value instanceof Result) {
+
+			Result r = (Result) value;
+
+			if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
+				return false;
+			}
+
+			if (value instanceof Publication) {
+
+			} else if (value instanceof Dataset) {
+
+			} else if (value instanceof OtherResearchProduct) {
+
+			} else if (value instanceof Software) {
+
+			}
+		}
+		return true;
+	}
+
+	public static <T extends Oaf> T cleanup(T value) {
+		if (value instanceof Datasource) {
+			// nothing to clean here
+		} else if (value instanceof Project) {
+			// nothing to clean here
+		} else if (value instanceof Organization) {
+			Organization o = (Organization) value;
+			if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
+				o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
+			}
+		} else if (value instanceof Relation) {
+			Relation r = (Relation) value;
+
+			Optional<String> validationDate = doCleanDate(r.getValidationDate());
+			if (validationDate.isPresent()) {
+				r.setValidationDate(validationDate.get());
+				r.setValidated(true);
+			} else {
+				r.setValidationDate(null);
+				r.setValidated(false);
+			}
+		} else if (value instanceof Result) {
+
+			Result r = (Result) value;
+
+			if (Objects.nonNull(r.getDateofacceptance())) {
+				Optional<String> date = cleanDateField(r.getDateofacceptance());
+				if (date.isPresent()) {
+					r.getDateofacceptance().setValue(date.get());
+				} else {
+					r.setDateofacceptance(null);
+				}
+			}
+			if (Objects.nonNull(r.getRelevantdate())) {
+				r.setRelevantdate(r.getRelevantdate()
+					.stream()
+					.filter(Objects::nonNull)
+					.filter(sp -> Objects.nonNull(sp.getQualifier()))
+					.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+					.map(sp -> {
+						sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
+						return sp;
+					})
+					.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+					.collect(Collectors.toList()));
+			}
+			if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
+				r.setPublisher(null);
+			}
+			if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
+				r.setLanguage(qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
+			}
+			if (Objects.nonNull(r.getSubject())) {
+				r.setSubject(r.getSubject()
+					.stream()
+					.filter(Objects::nonNull)
+					.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+					.filter(sp -> Objects.nonNull(sp.getQualifier()))
+					.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+					.map(GraphCleaningFunctions::cleanValue)
+					.collect(Collectors.toList()));
+			}
+			if (Objects.nonNull(r.getTitle())) {
+				r.setTitle(r.getTitle()
+					.stream()
+					.filter(Objects::nonNull)
+					.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+					.filter(sp -> sp
+						.getValue()
+						.toLowerCase()
+						.replaceAll(TITLE_FILTER_REGEX, "")
+						.length() > TITLE_FILTER_RESIDUAL_LENGTH)
+					.map(GraphCleaningFunctions::cleanValue)
+					.collect(Collectors.toList()));
+			}
+			if (Objects.nonNull(r.getDescription())) {
+				r.setDescription(r.getDescription()
+					.stream()
+					.filter(Objects::nonNull)
+					.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+					.map(GraphCleaningFunctions::cleanValue)
+					.collect(Collectors.toList()));
+			}
+			if (Objects.nonNull(r.getPid())) {
+				r.setPid(processPidCleaning(r.getPid()));
+			}
+			if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
+				r.setResourcetype(qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
+			}
+			if (Objects.nonNull(r.getInstance())) {
+
+				for (Instance i : r.getInstance()) {
+					if (Objects.nonNull(i.getPid())) {
+						i.setPid(processPidCleaning(i.getPid()));
+					}
+					if (Objects.nonNull(i.getAlternateIdentifier())) {
+						i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
+					}
+					Optional
+						.ofNullable(i.getPid())
+						.ifPresent(pid -> {
+							final Set<StructuredProperty> pids = Sets.newHashSet(pid);
+							Optional
+								.ofNullable(i.getAlternateIdentifier())
+								.ifPresent(altId -> {
+									final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
+									i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
+								});
+						});
+
+					if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
+						i.setAccessright(accessRight(
+							ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+							ModelConstants.DNET_ACCESS_MODES));
+					}
+					if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
+						i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
+					}
+					if (Objects.isNull(i.getRefereed())) {
+						i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
+					}
+					if (Objects.nonNull(i.getDateofacceptance())) {
+						Optional<String> date = cleanDateField(i.getDateofacceptance());
+						if (date.isPresent()) {
+							i.getDateofacceptance().setValue(date.get());
+						} else {
+							i.setDateofacceptance(null);
+						}
+					}
+				}
+			}
+			if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
+				Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
+				if (Objects.isNull(bestaccessrights)) {
+					r.setBestaccessright(qualifier(
+						ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+						ModelConstants.DNET_ACCESS_MODES));
+				} else {
+					r.setBestaccessright(bestaccessrights);
+				}
+			}
+			if (Objects.nonNull(r.getAuthor())) {
+				r.setAuthor(r.getAuthor()
+					.stream()
+					.filter(a -> Objects.nonNull(a))
+					.filter(a -> StringUtils.isNotBlank(a.getFullname()))
+					.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
+					.collect(Collectors.toList()));
+
+				boolean nullRank = r.getAuthor()
+					.stream()
+					.anyMatch(a -> Objects.isNull(a.getRank()));
+				if (nullRank) {
+					int i = 1;
+					for (Author author : r.getAuthor()) {
+						author.setRank(i++);
+					}
+				}
+
+				for (Author a : r.getAuthor()) {
+					if (Objects.isNull(a.getPid())) {
+						a.setPid(Lists.newArrayList());
+					} else {
+						a.setPid(a.getPid()
+							.stream()
+							.filter(Objects::nonNull)
+							.filter(p -> Objects.nonNull(p.getQualifier()))
+							.filter(p -> StringUtils.isNotBlank(p.getValue()))
+							.map(p -> {
+								// hack to distinguish orcid from orcid_pending
+								String pidProvenance = Optional
+									.ofNullable(p.getDataInfo())
+									.map(d -> Optional
+										.ofNullable(d.getProvenanceaction())
+										.map(Qualifier::getClassid)
+										.orElse(""))
+									.orElse("");
+								if (p.getQualifier().getClassid().toLowerCase().contains(ModelConstants.ORCID)) {
+									if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
+										p.getQualifier().setClassid(ModelConstants.ORCID);
+									} else {
+										p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
+									}
+									final String orcid = p.getValue()
+										.trim()
+										.toLowerCase()
+										.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
+									if (orcid.length() == ORCID_LEN) {
+										p.setValue(orcid);
+									} else {
+										p.setValue("");
+									}
+								}
+								return p;
+							})
+							.filter(p -> StringUtils.isNotBlank(p.getValue()))
+							.collect(Collectors.toMap(
+								p -> p.getQualifier().getClassid() + p.getValue(),
+								Function.identity(),
+								(p1, p2) -> p1,
+								LinkedHashMap::new))
+							.values()
+							.stream()
+							.collect(Collectors.toList()));
+					}
+				}
+			}
+			if (value instanceof Publication) {
+
+			} else if (value instanceof Dataset) {
+
+			} else if (value instanceof OtherResearchProduct) {
+
+			} else if (value instanceof Software) {
+
+			}
+		}
+
+		return value;
+	}
+
+	private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
+		return Optional
+			.ofNullable(dateofacceptance)
+			.map(Field::getValue)
+			.map(GraphCleaningFunctions::cleanDate)
+			.filter(Objects::nonNull);
+	}
+
+	protected static Optional<String> doCleanDate(String date) {
+		return Optional.ofNullable(cleanDate(date));
+	}
+
+	public static String cleanDate(final String inputDate) {
+
+		if (StringUtils.isBlank(inputDate)) {
+			return null;
+		}
+
+		try {
+			final LocalDate date = DateParserUtils
+				.parseDate(inputDate.trim())
+				.toInstant()
+				.atZone(ZoneId.systemDefault())
+				.toLocalDate();
+			return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
+		} catch (DateTimeParseException e) {
+			return null;
+		}
+	}
+
+	// HELPERS
+
+	private static boolean isValidAuthorName(Author a) {
+		return !Stream
+			.of(a.getFullname(), a.getName(), a.getSurname())
+			.filter(s -> s != null && !s.isEmpty())
+			.collect(Collectors.joining(""))
+			.toLowerCase()
+			.matches(INVALID_AUTHOR_REGEX);
+	}
+
+	private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
+		return pids
+			.stream()
+			.filter(Objects::nonNull)
+			.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
+			.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
+			.filter(sp -> Objects.nonNull(sp.getQualifier()))
+			.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+			.map(CleaningFunctions::normalizePidValue)
+			.filter(CleaningFunctions::pidFilter)
+			.collect(Collectors.toList());
+	}
+
+	private static void fixVocabName(Qualifier q, String vocabularyName) {
+		if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
+			q.setSchemeid(vocabularyName);
+			q.setSchemename(vocabularyName);
+		}
+	}
+
+	private static AccessRight accessRight(String classid, String classname, String scheme) {
+		return OafMapperUtils.accessRight(classid, classname, scheme, scheme);
+	}
+
+	private static Qualifier qualifier(String classid, String classname, String scheme) {
+		return OafMapperUtils.qualifier(classid, classname, scheme, scheme);
+	}
+
+	protected static StructuredProperty cleanValue(StructuredProperty s) {
+		s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
+		return s;
+	}
+
+	protected static Field<String> cleanValue(Field<String> s) {
+		s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
+		return s;
+	}
+
+}
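Note for reviewers: fixVocabularyNames, filter and cleanup are meant to be applied in that order over graph entities. A minimal sketch of a cleaning pass — the Spark wiring (Dataset, Encoders, the `results` input) is an illustrative assumption, not part of this commit; only the three GraphCleaningFunctions calls come from the file above:

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;

public class CleaningPassDemo {

	// 'results' would come from a spark.read(...) in a real workflow
	static Dataset<Result> clean(Dataset<Result> results) {
		return results
			// normalize vocabulary scheme ids/names first
			.map(
				(MapFunction<Result, Result>) GraphCleaningFunctions::fixVocabularyNames,
				Encoders.bean(Result.class))
			// drop records that fail the minimal sanity checks (e.g. empty title list)
			.filter((FilterFunction<Result>) GraphCleaningFunctions::filter)
			// then apply the field-level cleaning (dates, pids, access rights, authors, ...)
			.map(
				(MapFunction<Result, Result>) GraphCleaningFunctions::cleanup,
				Encoders.bean(Result.class));
	}
}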
@@ -0,0 +1,368 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class OafMapperUtils {
+
+	public static Oaf merge(final Oaf left, final Oaf right) {
+		if (ModelSupport.isSubClass(left, OafEntity.class)) {
+			return mergeEntities((OafEntity) left, (OafEntity) right);
+		} else if (ModelSupport.isSubClass(left, Relation.class)) {
+			((Relation) left).mergeFrom((Relation) right);
+		} else {
+			throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
+		}
+		return left;
+	}
+
+	public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
+		if (ModelSupport.isSubClass(left, Result.class)) {
+			return mergeResults((Result) left, (Result) right);
+		} else if (ModelSupport.isSubClass(left, Datasource.class)) {
+			left.mergeFrom(right);
+		} else if (ModelSupport.isSubClass(left, Organization.class)) {
+			left.mergeFrom(right);
+		} else if (ModelSupport.isSubClass(left, Project.class)) {
+			left.mergeFrom(right);
+		} else {
+			throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
+		}
+		return left;
+	}
+
+	public static Result mergeResults(Result left, Result right) {
+		if (new ResultTypeComparator().compare(left, right) < 0) {
+			left.mergeFrom(right);
+			return left;
+		} else {
+			right.mergeFrom(left);
+			return right;
+		}
+	}
+
+	public static KeyValue keyValue(final String k, final String v) {
+		final KeyValue kv = new KeyValue();
+		kv.setKey(k);
+		kv.setValue(v);
+		return kv;
+	}
+
+	public static List<KeyValue> listKeyValues(final String... s) {
+		if (s.length % 2 > 0) {
+			throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
+		}
+
+		final List<KeyValue> list = new ArrayList<>();
+		for (int i = 0; i < s.length; i += 2) {
+			list.add(keyValue(s[i], s[i + 1]));
+		}
+		return list;
+	}
+
+	public static <T> Field<T> field(final T value, final DataInfo info) {
+		if (value == null || StringUtils.isBlank(value.toString())) {
+			return null;
+		}
+
+		final Field<T> field = new Field<>();
+		field.setValue(value);
+		field.setDataInfo(info);
+		return field;
+	}
+
+	public static List<Field<String>> listFields(final DataInfo info, final String... values) {
+		return Arrays
+			.stream(values)
+			.map(v -> field(v, info))
+			.filter(Objects::nonNull)
+			.filter(distinctByKey(f -> f.getValue()))
+			.collect(Collectors.toList());
+	}
+
+	public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
+		return values
+			.stream()
+			.map(v -> field(v, info))
+			.filter(Objects::nonNull)
+			.filter(distinctByKey(f -> f.getValue()))
+			.collect(Collectors.toList());
+	}
+
+	public static Qualifier unknown(final String schemeid, final String schemename) {
+		return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
+	}
+
+	public static AccessRight accessRight(
+		final String classid,
+		final String classname,
+		final String schemeid,
+		final String schemename) {
+		return accessRight(classid, classname, schemeid, schemename, null);
+	}
+
+	public static AccessRight accessRight(
+		final String classid,
+		final String classname,
+		final String schemeid,
+		final String schemename,
+		final OpenAccessRoute openAccessRoute) {
+		final AccessRight accessRight = new AccessRight();
+		accessRight.setClassid(classid);
+		accessRight.setClassname(classname);
+		accessRight.setSchemeid(schemeid);
+		accessRight.setSchemename(schemename);
+		accessRight.setOpenAccessRoute(openAccessRoute);
+		return accessRight;
+	}
+
+	public static Qualifier qualifier(
+		final String classid,
+		final String classname,
+		final String schemeid,
+		final String schemename) {
+		final Qualifier q = new Qualifier();
+		q.setClassid(classid);
+		q.setClassname(classname);
+		q.setSchemeid(schemeid);
+		q.setSchemename(schemename);
+		return q;
+	}
+
+	public static Qualifier qualifier(final Qualifier qualifier) {
+		final Qualifier q = new Qualifier();
+		q.setClassid(qualifier.getClassid());
+		q.setClassname(qualifier.getClassname());
+		q.setSchemeid(qualifier.getSchemeid());
+		q.setSchemename(qualifier.getSchemename());
+		return q;
+	}
+
+	public static StructuredProperty structuredProperty(
+		final String value,
+		final String classid,
+		final String classname,
+		final String schemeid,
+		final String schemename,
+		final DataInfo dataInfo) {
+
+		return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
+	}
+
+	public static StructuredProperty structuredProperty(
+		final String value,
+		final Qualifier qualifier,
+		final DataInfo dataInfo) {
+		if (value == null) {
+			return null;
+		}
+		final StructuredProperty sp = new StructuredProperty();
+		sp.setValue(value);
+		sp.setQualifier(qualifier);
+		sp.setDataInfo(dataInfo);
+		return sp;
+	}
+
+	public static ExtraInfo extraInfo(
+		final String name,
+		final String value,
+		final String typology,
+		final String provenance,
+		final String trust) {
+		final ExtraInfo info = new ExtraInfo();
+		info.setName(name);
+		info.setValue(value);
+		info.setTypology(typology);
+		info.setProvenance(provenance);
+		info.setTrust(trust);
+		return info;
+	}
+
+	public static OAIProvenance oaiIProvenance(
+		final String identifier,
+		final String baseURL,
+		final String metadataNamespace,
+		final Boolean altered,
+		final String datestamp,
+		final String harvestDate) {
+
+		final OriginDescription desc = new OriginDescription();
+		desc.setIdentifier(identifier);
+		desc.setBaseURL(baseURL);
+		desc.setMetadataNamespace(metadataNamespace);
+		desc.setAltered(altered);
+		desc.setDatestamp(datestamp);
+		desc.setHarvestDate(harvestDate);
+
+		final OAIProvenance p = new OAIProvenance();
+		p.setOriginDescription(desc);
+
+		return p;
+	}
+
+	public static Journal journal(
+		final String name,
+		final String issnPrinted,
+		final String issnOnline,
+		final String issnLinking,
+		final DataInfo dataInfo) {
+
+		return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
+			name,
+			issnPrinted,
+			issnOnline,
+			issnLinking,
+			null,
+			null,
+			null,
+			null,
+			null,
+			null,
+			null,
+			dataInfo) : null;
+	}
+
+	public static Journal journal(
+		final String name,
+		final String issnPrinted,
+		final String issnOnline,
+		final String issnLinking,
+		final String ep,
+		final String iss,
+		final String sp,
+		final String vol,
+		final String edition,
+		final String conferenceplace,
+		final String conferencedate,
+		final DataInfo dataInfo) {
+
+		if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
+			final Journal j = new Journal();
+			j.setName(name);
+			j.setIssnPrinted(issnPrinted);
+			j.setIssnOnline(issnOnline);
+			j.setIssnLinking(issnLinking);
+			j.setEp(ep);
+			j.setIss(iss);
+			j.setSp(sp);
+			j.setVol(vol);
+			j.setEdition(edition);
+			j.setConferenceplace(conferenceplace);
+			j.setConferencedate(conferencedate);
+			j.setDataInfo(dataInfo);
+			return j;
+		} else {
+			return null;
+		}
+	}
+
+	private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
+		return StringUtils.isNotBlank(issnPrinted)
+			|| StringUtils.isNotBlank(issnOnline)
+			|| StringUtils.isNotBlank(issnLinking);
+	}
+
+	public static DataInfo dataInfo(
+		final Boolean deletedbyinference,
+		final String inferenceprovenance,
+		final Boolean inferred,
+		final Boolean invisible,
+		final Qualifier provenanceaction,
+		final String trust) {
+		final DataInfo d = new DataInfo();
+		d.setDeletedbyinference(deletedbyinference);
+		d.setInferenceprovenance(inferenceprovenance);
+		d.setInferred(inferred);
+		d.setInvisible(invisible);
+		d.setProvenanceaction(provenanceaction);
+		d.setTrust(trust);
+		return d;
+	}
+
+	public static String createOpenaireId(
+		final int prefix,
+		final String originalId,
+		final boolean to_md5) {
+		if (StringUtils.isBlank(originalId)) {
+			return null;
+		} else if (to_md5) {
+			final String nsPrefix = StringUtils.substringBefore(originalId, "::");
+			final String rest = StringUtils.substringAfter(originalId, "::");
+			return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
+		} else {
+			return String.format("%s|%s", prefix, originalId);
+		}
+	}
+
+	public static String createOpenaireId(
+		final String type,
+		final String originalId,
+		final boolean to_md5) {
+		switch (type) {
+			case "datasource":
+				return createOpenaireId(10, originalId, to_md5);
+			case "organization":
+				return createOpenaireId(20, originalId, to_md5);
+			case "person":
+				return createOpenaireId(30, originalId, to_md5);
+			case "project":
+				return createOpenaireId(40, originalId, to_md5);
+			default:
+				return createOpenaireId(50, originalId, to_md5);
+		}
+	}
+
+	public static String asString(final Object o) {
+		return o == null ? "" : o.toString();
+	}
+
+	public static <T> Predicate<T> distinctByKey(
+		final Function<? super T, ?> keyExtractor) {
+		final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
+		return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
+	}
+
+	public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
+		return getBestAccessRights(instanceList);
+	}
+
+	protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
+		if (instanceList != null) {
+			final Optional<AccessRight> min = instanceList
+				.stream()
+				.map(i -> i.getAccessright())
+				.min(new AccessRightComparator<>());
+
+			final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
+
+			if (StringUtils.isBlank(rights.getClassid())) {
+				rights.setClassid(UNKNOWN);
+			}
+			if (StringUtils.isBlank(rights.getClassname())
+				|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
+				rights.setClassname(NOT_AVAILABLE);
+			}
+			if (StringUtils.isBlank(rights.getSchemeid())) {
+				rights.setSchemeid(DNET_ACCESS_MODES);
+			}
+			if (StringUtils.isBlank(rights.getSchemename())) {
+				rights.setSchemename(DNET_ACCESS_MODES);
+			}
+
+			return rights;
+		}
+		return null;
+	}
+}
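Note for reviewers: the helpers above centralize construction of the Oaf model primitives (Qualifier, StructuredProperty, KeyValue, DataInfo). A short usage sketch — all literal values below are placeholders chosen for illustration, not values used by this commit:

import java.util.List;

import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

public class OafMapperUtilsDemo {
	public static void main(String[] args) {
		// placeholder provenance/trust values
		DataInfo info = OafMapperUtils.dataInfo(
			false, "", false, false,
			OafMapperUtils.qualifier(
				"sysimport:crosswalk", "sysimport:crosswalk",
				"dnet:provenanceActions", "dnet:provenanceActions"),
			"0.9");
		// a DOI pid carrying the DataInfo above
		StructuredProperty doi = OafMapperUtils.structuredProperty(
			"10.1234/example", "doi", "Digital Object Identifier",
			"dnet:pid_types", "dnet:pid_types", info);
		// listKeyValues pairs up its varargs as (key, value, key, value, ...)
		List<KeyValue> collectedFrom = OafMapperUtils.listKeyValues(
			"10|openaire____::0000", "Example Datasource");
		System.out.println(doi.getValue() + " / " + collectedFrom.get(0).getValue());
	}
}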
@@ -1,11 +1,11 @@

 package eu.dnetlib.dhp.utils;

-import java.util.Map;
-
-import javax.xml.ws.BindingProvider;
-
+import org.apache.cxf.endpoint.Client;
+import org.apache.cxf.frontend.ClientProxy;
 import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
+import org.apache.cxf.transport.http.HTTPConduit;
+import org.apache.cxf.transports.http.configuration.HTTPClientPolicy;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -15,8 +15,8 @@ public class ISLookupClientFactory {

 	private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);

-	private static int requestTimeout = 60000 * 10;
-	private static int connectTimeout = 60000 * 10;
+	private static final int requestTimeout = 60000 * 10;
+	private static final int connectTimeout = 60000 * 10;

 	public static ISLookUpService getLookUpService(final String isLookupUrl) {
 		return getServiceStub(ISLookUpService.class, isLookupUrl);
@@ -31,20 +31,23 @@ public class ISLookupClientFactory {

 		final T service = (T) jaxWsProxyFactory.create();

-		if (service instanceof BindingProvider) {
+		Client client = ClientProxy.getClient(service);
+		if (client != null) {
+			HTTPConduit conduit = (HTTPConduit) client.getConduit();
+			HTTPClientPolicy policy = new HTTPClientPolicy();
+
 			log
 				.info(
-					"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
-					BindingProvider.class.getName(), requestTimeout, connectTimeout);
+					String
+						.format(
+							"setting connectTimeout to %s, requestTimeout to %s for service %s",
+							connectTimeout,
+							requestTimeout,
+							clazz.getCanonicalName()));

-			Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
-
-			requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
-			requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
-			requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
-			requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
-			requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
-			requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
+			policy.setConnectionTimeout(connectTimeout);
+			policy.setReceiveTimeout(requestTimeout);
+			conduit.setClient(policy);
 		}

 		return service;
@ -0,0 +1,180 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class OafMapperUtilsTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
public void testDateValidation() {
|
||||
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
|
||||
|
||||
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
|
||||
|
||||
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
|
||||
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
|
||||
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
|
||||
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
|
||||
assertEquals(
|
||||
"2015-07-03",
|
||||
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
|
||||
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
|
||||
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
|
||||
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
|
||||
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
|
||||
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
|
||||
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
|
||||
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
|
||||
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
|
||||
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
|
||||
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
|
||||
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
|
||||
|
||||
}
|
||||
|
||||
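For readers tracing these assertions, a minimal, self-contained sketch of the normalization contract they pin down: heterogeneous inputs are parsed leniently and truncated to an ISO-8601 local date, with unparseable values mapped to an empty Optional. The sketch assumes a general-purpose parser such as com.github.sisyphsu's dateparser; it is an illustration, not the project's implementation.

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Optional;

import com.github.sisyphsu.dateparser.DateParserUtils;

public class DateCleaningSketch {

	public static Optional<String> cleanDate(final String raw) {
		try {
			// lenient parse of many date shapes, then keep only the local date part
			final Date parsed = DateParserUtils.parseDate(raw);
			return Optional.of(new SimpleDateFormat("yyyy-MM-dd").format(parsed));
		} catch (final Exception e) {
			return Optional.empty(); // unparseable input
		}
	}

	public static void main(final String[] args) {
		System.out.println(cleanDate("8/8/1965 1:00 PM")); // expected 1965-08-08, per the assertions above
		System.out.println(cleanDate("20140601")); // expected 2014-06-01
	}
}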
@Test
public void testDate() {
System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
}

@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);

assertEquals(p1.getCollectedfrom().size(), 1);
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
assertEquals(d2.getCollectedfrom().size(), 1);
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));

assertTrue(
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid()
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));

assertEquals(p2.getCollectedfrom().size(), 1);
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(d1.getCollectedfrom().size(), 1);
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));

assertTrue(
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid()
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
}

protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
}

protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}

}
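The testMergePubs assertions above encode a precedence rule: when a publication and a dataset carrying the same identifier are merged, the merged record takes the result type of the side collected from Crossref. A compact stand-alone illustration of that rule, under simplified stand-in types (Rec stands in for the project's Result; the rule shown is a sketch, not the project's full merge logic). The JSON fixtures that follow show the corresponding collectedfrom keys.

import java.util.Arrays;
import java.util.List;

public class MergePrecedenceSketch {

	static final String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";

	static class Rec {
		final String resultType;
		final List<String> collectedFromKeys;

		Rec(final String resultType, final List<String> collectedFromKeys) {
			this.resultType = resultType;
			this.collectedFromKeys = collectedFromKeys;
		}
	}

	// the Crossref-backed side wins the result type
	static String mergedResultType(final Rec left, final Rec right) {
		return left.collectedFromKeys.contains(CROSSREF_ID) ? left.resultType : right.resultType;
	}

	public static void main(final String[] args) {
		final Rec p1 = new Rec("publication", Arrays.asList(CROSSREF_ID));
		final Rec d2 = new Rec("dataset", Arrays.asList("10|openaire____::081b82f96300b6a6e3d282bad31cb6e3"));
		System.out.println(mergedResultType(p1, d2)); // publication
		System.out.println(mergedResultType(d2, p1)); // publication: the Crossref side wins either way
	}
}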
@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resulttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resulttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resulttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

@@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resulttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}
@@ -3,20 +3,23 @@ package eu.dnetlib.dhp.actionmanager;

import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.lang3.tuple.Triple;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.actionmanager.set.ActionManagerSet;

@@ -25,6 +28,7 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;

public class ISClient implements Serializable {

@@ -40,80 +44,52 @@ public class ISClient implements Serializable {

public List<String> getLatestRawsetPaths(String setIds) {

List<String> ids = Lists
.newArrayList(
final Set<String> ids = Sets
.newHashSet(
Splitter
.on(INPUT_ACTION_SET_ID_SEPARATOR)
.omitEmptyStrings()
.trimResults()
.split(setIds));

return ids
.stream()
.map(id -> getSet(isLookup, id))
.map(as -> as.getPathToLatest())
.collect(Collectors.toCollection(ArrayList::new));
}

private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) {

final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') "
+ "where $x//SET/@id = '"
+ setId
+ "' return $x";

try {
final String basePath = getBasePathHDFS(isLookup);
final String setProfile = isLookup.getResourceProfileByQuery(q);
return getActionManagerSet(basePath, setProfile);
} catch (ISLookUpException | ActionManagerException e) {
throw new RuntimeException("Error accessing Sets, using query: " + q);

// <SET id="..." directory="..." latest="xxx"/>
final String xquery = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') "
+
"return <SET id='{$x//SET/@id/string()}' directory='{$x//SET/@directory/string()}' latest='{$x//LATEST/@id/string()}'/>";
return Optional
.ofNullable(isLookup.quickSearchProfile(xquery))
.map(
sets -> sets
.stream()
.map(set -> parseSetInfo(set))
.filter(t -> ids.contains(t.getLeft()))
.map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ActionManagerException | ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS");
}
}

private ActionManagerSet getActionManagerSet(final String basePath, final String profile)
throws ActionManagerException {
final SAXReader reader = new SAXReader();
final ActionManagerSet set = new ActionManagerSet();

private Triple<String, String, String> parseSetInfo(String set) {
try {
final Document doc = reader.read(new StringReader(profile));

set.setId(doc.valueOf("//SET/@id").trim());
set.setName(doc.valueOf("//SET").trim());
set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim()));
set
.setLatest(
doc.valueOf("//RAW_SETS/LATEST/@id"),
doc.valueOf("//RAW_SETS/LATEST/@creationDate"),
doc.valueOf("//RAW_SETS/LATEST/@lastUpdate"));
set.setDirectory(doc.valueOf("//SET/@directory"));
final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED");
if (expiredNodes != null) {
for (int i = 0; i < expiredNodes.size(); i++) {
Element ex = (Element) expiredNodes.get(i);
set
.addExpired(
ex.attributeValue("id"),
ex.attributeValue("creationDate"),
ex.attributeValue("lastUpdate"));
}
}

final StringBuilder sb = new StringBuilder();
sb.append(basePath);
sb.append("/");
sb.append(doc.valueOf("//SET/@directory"));
sb.append("/");
sb.append(doc.valueOf("//RAW_SETS/LATEST/@id"));
set.setPathToLatest(sb.toString());

return set;
} catch (Exception e) {
throw new ActionManagerException("Error creating set from profile: " + profile, e);
Document doc = new SAXReader().read(new StringReader(set));
return Triple
.of(
doc.valueOf("//SET/@id"),
doc.valueOf("//SET/@directory"),
doc.valueOf("//SET/@latest"));
} catch (DocumentException e) {
throw new IllegalStateException(e);
}
}

private String buildDirectory(String basePath, Triple<String, String, String> t) {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
}

private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
return queryServiceProperty(isLookup, "basePath");
}
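For reference, a self-contained version of the profile parsing introduced above, runnable against a <SET/> fragment like the one in the inline comment; the sample base path and attribute values are hypothetical.

import java.io.StringReader;

import org.apache.commons.lang3.tuple.Triple;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

public class ParseSetInfoSketch {

	// same attribute extraction as parseSetInfo above
	public static Triple<String, String, String> parseSetInfo(final String set) {
		try {
			final Document doc = new SAXReader().read(new StringReader(set));
			return Triple.of(
				doc.valueOf("//SET/@id"),
				doc.valueOf("//SET/@directory"),
				doc.valueOf("//SET/@latest"));
		} catch (final DocumentException e) {
			throw new IllegalStateException(e);
		}
	}

	public static void main(final String[] args) {
		final Triple<String, String, String> t = parseSetInfo(
			"<SET id='as-1' directory='as-dir' latest='rawset-2021'/>");
		// basePath/directory/latest, as assembled by buildDirectory above
		System.out.println(String.join("/", "/var/actionsets", t.getMiddle(), t.getRight()));
	}
}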
@@ -160,9 +160,9 @@ public class PromoteActionPayloadForGraphTableJob {

private static String extractPayload(Row value) {
try {
return value.<String> getAs("payload");
return value.getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value.toString());
logger.error("cannot extract payload from action: {}", value);
throw e;
}
}

@@ -75,7 +75,6 @@ public class CollectAndSave implements Serializable {
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}

private static void removeOutputDir(SparkSession spark, String path) {

@@ -36,7 +36,7 @@ import scala.Tuple2;
*/
public class SparkAtomicActionScoreJob implements Serializable {

private static String DOI = "doi";
private static final String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

@@ -17,6 +17,7 @@ import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern

@@ -164,6 +165,16 @@ object DataciteToOAFTransformation {
d
}

def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}

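The helper above reads the parsed year as a Buddhist Era year (BE = CE + 543), re-interprets it via ThaiBuddhistDate, and renders the date back in the ISO calendar. A stand-alone Java equivalent of the same round trip:

import java.time.LocalDate;
import java.time.chrono.ThaiBuddhistDate;
import java.time.format.DateTimeFormatter;

public class ThaiDateSketch {

	public static String fixThaiDate(final String input, final String pattern) {
		try {
			// the square brackets used by the caller mark the pattern section optional
			final LocalDate parsed = LocalDate.parse(input, DateTimeFormatter.ofPattern(pattern));
			// re-read the parsed year as a Buddhist Era year (BE = CE + 543)
			final ThaiBuddhistDate thai = ThaiBuddhistDate
				.of(parsed.getYear(), parsed.getMonthValue(), parsed.getDayOfMonth());
			return LocalDate.from(thai).toString();
		} catch (final Exception e) {
			return ""; // mirror the Scala helper: unparseable input yields ""
		}
	}

	public static void main(final String[] args) {
		// 2564 BE corresponds to 2021 CE
		System.out.println(fixThaiDate("2564-04-08", "[yyyy-MM-dd]")); // 2021-04-08
	}
}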
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)

@@ -377,17 +388,31 @@ object DataciteToOAFTransformation {
.map(d => d.get)

if (a_date.isDefined) {
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))

} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}

result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))

@@ -468,11 +493,11 @@ object DataciteToOAFTransformation {
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri

val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
result.setId(IdentifierFactory.createIdentifier(result))
if (result.getId == null)
return List()
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
@@ -56,6 +56,7 @@ object ImportDatacite {
val hdfsTargetPath = new Path(targetPath)
log.info(s"hdfsTargetPath is $hdfsTargetPath")

val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt

val skipImport = parser.get("skipImport")
log.info(s"skipImport is $skipImport")

@@ -110,7 +111,7 @@ object ImportDatacite {

println(s"last Timestamp is $ts")

val cnt = if ("true".equalsIgnoreCase(skipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
val cnt = if ("true".equalsIgnoreCase(skipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)

println(s"Imported from Datacite API $cnt documents")

@@ -137,7 +138,7 @@ object ImportDatacite {
}
}

private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
var from:Long = timestamp * 1000
val delta:Long = 50000000L
var client: DataciteAPIImporter = null

@@ -148,7 +149,7 @@ object ImportDatacite {
try {
var start: Long = System.currentTimeMillis
while (from < now) {
client = new DataciteAPIImporter(from, 100, from + delta)
client = new DataciteAPIImporter(from, bs, from + delta)
var end: Long = 0
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
@@ -143,24 +143,8 @@ public class PrepareProgramme {

JavaRDD<CSVProgramme> h2020Programmes = programme
.toJavaRDD()
.filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020"))
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
.reduceByKey((a, b) -> {
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}

return a;

})
.reduceByKey(PrepareProgramme::groupProgrammeByCode)
.map(p -> {
CSVProgramme csvProgramme = p._2();
String programmeTitle = csvProgramme.getTitle().trim();

@@ -177,20 +161,31 @@ public class PrepareProgramme {
return csvProgramme;
});

// prepareClassification(h2020Programmes);

JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());

JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
rdd
.map(csvProgramme -> {
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
return tmp;
})
.map(OBJECT_MAPPER::writeValueAsString)
.saveAsTextFile(outputPath);

}

private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) {
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}

return a;
}

private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes
.map(

@@ -241,15 +236,15 @@ public class PrepareProgramme {
if (!ent.contains("Euratom")) {

String parent;
String tmp_key = tmp[0] + ".";
String tmpKey = tmp[0] + ".";
for (int i = 1; i < tmp.length - 1; i++) {
tmp_key += tmp[i] + ".";
parent = map.get(tmp_key)._1().toLowerCase().trim();
tmpKey += tmp[i] + ".";
parent = map.get(tmpKey)._1().toLowerCase().trim();
if (parent.contains("|")) {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
}
if (current.trim().length() > parent.length()
&& current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) {
&& current.toLowerCase().trim().startsWith(parent)) {
current = current.substring(parent.length() + 1);
if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') {
current = current.trim().substring(1).trim();

@@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;

@@ -32,7 +31,6 @@ public class PrepareProjects {

private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>();

public static void main(String[] args) throws Exception {

@@ -93,7 +91,7 @@ public class PrepareProjects {
}

private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
return value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) {

@@ -120,7 +120,6 @@ public class SparkAtomicActionJob {
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {

CSVProject csvProject = c._1();
Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2());

return Optional
.ofNullable(c._2())

@@ -135,9 +134,9 @@ public class SparkAtomicActionJob {
H2020Programme pm = new H2020Programme();
H2020Classification h2020classification = new H2020Classification();
pm.setCode(csvProject.getProgramme());
h2020classification.setClassification(ocsvProgramme.get().getClassification());
h2020classification.setClassification(csvProgramme.getClassification());
h2020classification.setH2020Programme(pm);
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification));

@@ -145,10 +144,11 @@ public class SparkAtomicActionJob {
})
.orElse(null);

}, Encoders.bean(Project.class));
}, Encoders.bean(Project.class))
.filter(Objects::nonNull);

aaproject
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")))
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
Project rp = p._1();
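The last change above turns the project–topic join into a left outer join, so projects whose h2020topiccode has no matching EXCEL topic row are no longer silently dropped; the Optional.ofNullable(p._2()) in the mapper then absorbs the null right side. A minimal runnable illustration of the difference, with purely illustrative names and data:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class LeftJoinSketch {

	public static void main(final String[] args) {
		final SparkSession spark = SparkSession.builder().master("local[1]").appName("left-join").getOrCreate();

		final Dataset<String> projectTopicCodes = spark.createDataset(Arrays.asList("T1", "T2"), Encoders.STRING());
		final Dataset<String> knownTopics = spark.createDataset(Arrays.asList("T1"), Encoders.STRING());

		// "left" keeps T2 paired with null; an inner join would drop it entirely
		final Dataset<Tuple2<String, String>> joined = projectTopicCodes
			.joinWith(knownTopics, projectTopicCodes.col("value").equalTo(knownTopics.col("value")), "left");

		joined.show();
		spark.stop();
	}
}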
@@ -7,14 +7,7 @@ import java.io.Serializable;
* The model for the programme csv file
*/
public class CSVProgramme implements Serializable {
private String parentProgramme;
private String frameworkProgramme;
private String startDate;
private String endDate;
private String objective;
private String subjects;
private String legalBasis;
private String call;

private String rcn;
private String code;

@@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable {
this.language = language;
}

public String getParentProgramme() {
return parentProgramme;
}

public void setParentProgramme(String parentProgramme) {
this.parentProgramme = parentProgramme;
}

public String getFrameworkProgramme() {
return frameworkProgramme;
}

public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}

public String getStartDate() {
return startDate;
}

public void setStartDate(String startDate) {
this.startDate = startDate;
}

public String getEndDate() {
return endDate;
}

public void setEndDate(String endDate) {
this.endDate = endDate;
}

public String getObjective() {
return objective;
}

public void setObjective(String objective) {
this.objective = objective;
}

public String getSubjects() {
return subjects;
}

public void setSubjects(String subjects) {
this.subjects = subjects;
}

public String getLegalBasis() {
return legalBasis;
}

public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}

public String getCall() {
return call;
}

public void setCall(String call) {
this.call = call;
}
//
}
@@ -22,15 +22,18 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
*/
public class EXCELParser {

public <R> List<R> parse(InputStream file, String classForName)
public <R> List<R> parse(InputStream file, String classForName, String sheetName)
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
InvalidFormatException {

// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
OPCPackage pkg = OPCPackage.open(file);
XSSFWorkbook wb = new XSSFWorkbook(pkg);

XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
XSSFSheet sheet = wb.getSheet(sheetName);

if (sheet == null) {
throw new RuntimeException("Sheet name " + sheetName + " not present in current file");
}

List<R> ret = new ArrayList<>();

@@ -49,12 +52,11 @@ public class EXCELParser {
headers.add(dataFormatter.formatCellValue(cell));
}
} else {
Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
Class<?> clazz = Class.forName(classForName);
final Object cc = clazz.newInstance();

for (int i = 0; i < headers.size(); i++) {
Cell cell = row.getCell(i);
String value = dataFormatter.formatCellValue(cell);
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);

}
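The parse method above maps each data row onto a bean by reflection, using the header row as the list of field names. A stand-alone sketch of that mechanism; the Bean class and sample values here are hypothetical:

import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang3.reflect.FieldUtils;

public class ExcelRowMappingSketch {

	public static class Bean {
		private String code;
		private String title;

		@Override
		public String toString() {
			return code + " / " + title;
		}
	}

	public static void main(final String[] args) throws Exception {
		final List<String> headers = Arrays.asList("code", "title");
		final String[] row = { "H2020-EU.1.", "Excellent science" };

		final Bean bean = Bean.class.newInstance();
		for (int i = 0; i < headers.size(); i++) {
			// same call as in EXCELParser: write by field name, forcing access to private fields
			FieldUtils.writeField(bean, headers.get(i), row[i], true);
		}
		System.out.println(bean); // H2020-EU.1. / Excellent science
	}
}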
@@ -28,7 +28,7 @@ public class ReadCSV implements Closeable {
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private String csvFile;
private final String csvFile;

public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -85,7 +85,6 @@ public class ReadCSV implements Closeable {

this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.csvFile = httpConnector.getInputSource(fileURL);
;
}

protected void write(final Object p) {

@@ -25,7 +25,7 @@ public class ReadExcel implements Closeable {
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private InputStream excelFile;
private final InputStream excelFile;

public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(

@@ -41,19 +41,20 @@ public class ReadExcel implements Closeable {
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode");
final String classForName = parser.get("classForName");
final String sheetName = parser.get("sheetName");

try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {

log.info("Getting Excel file...");
readExcel.execute(classForName);
readExcel.execute(classForName, sheetName);

}
}

public void execute(final String classForName) throws Exception {
public void execute(final String classForName, final String sheetName) throws Exception {
EXCELParser excelParser = new EXCELParser();
excelParser
.parse(excelFile, classForName)
.parse(excelFile, classForName, sheetName)
.stream()
.forEach(p -> write(p));

@@ -82,7 +83,6 @@ public class ReadExcel implements Closeable {

this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
;
}

protected void write(final Object p) {
@@ -0,0 +1,215 @@

package eu.dnetlib.dhp.actionmanager.ror;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

public class GenerateRorActionSetJob {

private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class);

private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

private static final String ROR_NS_PREFIX = "ror_________";

private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
"10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");

private static final DataInfo ROR_DATA_INFO = dataInfo(
false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");

private static final Qualifier ROR_PID_TYPE = qualifier(
"ROR", "ROR", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);

public static void main(final String[] args) throws Exception {

final String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionJob.class
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"));

final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

parser.parseArgument(args);

final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);

log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);

final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);

final SparkConf conf = new SparkConf();

runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
processRorOrganizations(spark, inputPath, outputPath);
});
}

private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}

private static void processRorOrganizations(final SparkSession spark,
final String inputPath,
final String outputPath) throws Exception {

readInputPath(spark, inputPath)
.map(
(MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
Encoders.bean(Organization.class))
.toJavaRDD()
.map(o -> new AtomicAction<>(Organization.class, o))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}

protected static Organization convertRorOrg(final RorOrganization r) {

final Date now = new Date();

final Organization o = new Organization();

o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
o.setCollectedfrom(ROR_COLLECTED_FROM);
o.setPid(pids(r));
o.setDateofcollection(now.toString());
o.setDateoftransformation(now.toString());
o.setExtraInfo(new ArrayList<>()); // Values not present in the file
o.setOaiprovenance(null); // Values not present in the file
o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO));
o.setLegalname(field(r.getName(), ROR_DATA_INFO));
o.setAlternativeNames(alternativeNames(r));
o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO));
o.setLogourl(null);
o.setEclegalbody(null);
o.setEclegalperson(null);
o.setEcnonprofit(null);
o.setEcresearchorganization(null);
o.setEchighereducation(null);
o.setEcinternationalorganizationeurinterests(null);
o.setEcinternationalorganization(null);
o.setEcenterprise(null);
o.setEcsmevalidated(null);
o.setEcnutscode(null);
if (r.getCountry() != null) {
o
.setCountry(
qualifier(
r.getCountry().getCountryCode(), r
.getCountry()
.getCountryName(),
ModelConstants.DNET_COUNTRY_TYPE, ModelConstants.DNET_COUNTRY_TYPE));
} else {
o.setCountry(null);
}
o.setDataInfo(ROR_DATA_INFO);
o.setLastupdatetimestamp(now.getTime());

return o;
}

private static List<StructuredProperty> pids(final RorOrganization r) {
final List<StructuredProperty> pids = new ArrayList<>();
pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO));

for (final Map.Entry<String, ExternalIdType> e : r.getExternalIds().entrySet()) {
final String type = e.getKey();
final List<String> all = e.getValue().getAll();
if (all != null) {
final Qualifier qualifier = qualifier(
type, type, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);
for (final String pid : all) {
pids
.add(structuredProperty(pid, qualifier, ROR_DATA_INFO));
}
}
}

return pids;
}

private static List<Field<String>> alternativeNames(final RorOrganization r) {
final Set<String> names = new LinkedHashSet<>();
names.addAll(r.getAliases());
names.addAll(r.getAcronyms());
r.getLabels().forEach(l -> names.add(l.getLabel()));

return names
.stream()
.filter(StringUtils::isNotBlank)
.map(s -> field(s, ROR_DATA_INFO))
.collect(Collectors.toList());
}

private static Dataset<RorOrganization> readInputPath(
final SparkSession spark,
final String path) throws Exception {

try (final FileSystem fileSystem = FileSystem.get(new Configuration());
final InputStream is = fileSystem.open(new Path(path))) {
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
}
}

}
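A small sketch of the identifier scheme used by convertRorOrg above: the organization id is a "20|" prefix, the fixed 12-character ROR namespace, and the MD5 of the ROR id. DHPUtils.md5 is assumed here to be a plain MD5-hex helper, and the sample input is illustrative only.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class RorIdSketch {

	public static String openaireOrgId(final String rorId) throws Exception {
		final MessageDigest md = MessageDigest.getInstance("MD5");
		final byte[] digest = md.digest(rorId.getBytes(StandardCharsets.UTF_8));
		final StringBuilder hex = new StringBuilder();
		for (final byte b : digest) {
			hex.append(String.format("%02x", b)); // hex-encode the 16 digest bytes
		}
		return String.format("20|%s::%s", "ror_________", hex);
	}

	public static void main(final String[] args) throws Exception {
		// illustrative input; any ROR id URL would do
		System.out.println(openaireOrgId("https://ror.org/012345678"));
	}
}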
@@ -0,0 +1,122 @@

package eu.dnetlib.dhp.actionmanager.ror.model;

import java.io.Serializable;

import com.fasterxml.jackson.annotation.JsonProperty;

public class Address implements Serializable {

@JsonProperty("lat")
private Float lat;

@JsonProperty("state_code")
private String stateCode;

@JsonProperty("country_geonames_id")
private Integer countryGeonamesId;

@JsonProperty("lng")
private Float lng;

@JsonProperty("state")
private String state;

@JsonProperty("city")
private String city;

@JsonProperty("geonames_city")
private GeonamesCity geonamesCity;

@JsonProperty("postcode")
private String postcode;

@JsonProperty("primary")
private Boolean primary;

@JsonProperty("line")
private String line;

private final static long serialVersionUID = 2444635485253443195L;

public Float getLat() {
return lat;
}

public void setLat(final Float lat) {
this.lat = lat;
}

public String getStateCode() {
return stateCode;
}

public void setStateCode(final String stateCode) {
this.stateCode = stateCode;
}

public Integer getCountryGeonamesId() {
return countryGeonamesId;
}

public void setCountryGeonamesId(final Integer countryGeonamesId) {
this.countryGeonamesId = countryGeonamesId;
}

public Float getLng() {
return lng;
}

public void setLng(final Float lng) {
this.lng = lng;
}

public String getState() {
return state;
}

public void setState(final String state) {
this.state = state;
}

public String getCity() {
return city;
}

public void setCity(final String city) {
this.city = city;
}

public GeonamesCity getGeonamesCity() {
return geonamesCity;
}

public void setGeonamesCity(final GeonamesCity geonamesCity) {
this.geonamesCity = geonamesCity;
}

public String getPostcode() {
return postcode;
}

public void setPostcode(final String postcode) {
this.postcode = postcode;
}

public Boolean getPrimary() {
return primary;
}

public void setPrimary(final Boolean primary) {
this.primary = primary;
}

public String getLine() {
return line;
}

public void setLine(final String line) {
this.line = line;
}

}

@@ -0,0 +1,34 @@

package eu.dnetlib.dhp.actionmanager.ror.model;

import java.io.Serializable;

import com.fasterxml.jackson.annotation.JsonProperty;

public class Country implements Serializable {

@JsonProperty("country_code")
private String countryCode;

@JsonProperty("country_name")
private String countryName;

private final static long serialVersionUID = 4357848706229493627L;

public String getCountryCode() {
return countryCode;
}

public void setCountryCode(final String countryCode) {
this.countryCode = countryCode;
}

public String getCountryName() {
return countryName;
}

public void setCountryName(final String countryName) {
this.countryName = countryName;
}

}

@@ -0,0 +1,42 @@

package eu.dnetlib.dhp.actionmanager.ror.model;

import java.io.Serializable;
import java.util.List;

import com.fasterxml.jackson.databind.annotation.JsonDeserialize;

@JsonDeserialize(using = ExternalIdTypeDeserializer.class)
public class ExternalIdType implements Serializable {

private List<String> all;

private String preferred;

private final static long serialVersionUID = 2616688352998387611L;

public ExternalIdType() {
}

public ExternalIdType(final List<String> all, final String preferred) {
this.all = all;
this.preferred = preferred;
}

public List<String> getAll() {
return all;
}

public void setAll(final List<String> all) {
this.all = all;
}

public String getPreferred() {
return preferred;
}

public void setPreferred(final String preferred) {
this.preferred = preferred;
}

}

@@ -0,0 +1,38 @@

package eu.dnetlib.dhp.actionmanager.ror.model;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;

public class ExternalIdTypeDeserializer extends JsonDeserializer<ExternalIdType> {

@Override
public ExternalIdType deserialize(final JsonParser p, final DeserializationContext ctxt)
throws IOException, JsonProcessingException {
final ObjectCodec oc = p.getCodec();
final JsonNode node = oc.readTree(p);

final JsonNode allNode = node.get("all");

final String preferred = node.get("preferred").asText();

final List<String> all = new ArrayList<>();

if (allNode.isArray()) {
allNode.elements().forEachRemaining(x -> all.add(x.asText()));
} else {
all.add(allNode.asText());
}

return new ExternalIdType(all, preferred);
}

}
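For context, a usage sketch of the deserializer above: ROR publishes external_ids whose "all" member is sometimes a scalar and sometimes an array, and the custom deserializer normalizes both shapes to a List<String>. The JSON snippets below are illustrative:

import java.util.Map;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;

public class ExternalIdTypeUsageSketch {

	public static void main(final String[] args) throws Exception {
		final ObjectMapper mapper = new ObjectMapper();

		// scalar "all" value
		final String scalar = "{\"ISNI\": {\"all\": \"0000 0001 2308 257X\", \"preferred\": \"0000 0001 2308 257X\"}}";
		// array "all" value
		final String array = "{\"FundRef\": {\"all\": [\"501100003977\", \"501100001659\"], \"preferred\": \"501100003977\"}}";

		final Map<String, ExternalIdType> m1 = mapper
			.readValue(scalar, new TypeReference<Map<String, ExternalIdType>>() {
			});
		final Map<String, ExternalIdType> m2 = mapper
			.readValue(array, new TypeReference<Map<String, ExternalIdType>>() {
			});

		System.out.println(m1.get("ISNI").getAll()); // single-element list
		System.out.println(m2.get("FundRef").getAll()); // two-element list
	}
}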
@ -0,0 +1,56 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class GeonamesAdmin implements Serializable {
|
||||
|
||||
@JsonProperty("ascii_name")
|
||||
private String asciiName;
|
||||
|
||||
@JsonProperty("id")
|
||||
private Integer id;
|
||||
|
||||
@JsonProperty("name")
|
||||
private String name;
|
||||
|
||||
@JsonProperty("code")
|
||||
private String code;
|
||||
|
||||
private final static long serialVersionUID = 7294958526269195673L;
|
||||
|
||||
public String getAsciiName() {
|
||||
return asciiName;
|
||||
}
|
||||
|
||||
public void setAsciiName(final String asciiName) {
|
||||
this.asciiName = asciiName;
|
||||
}
|
||||
|
||||
public Integer getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final Integer id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(final String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class GeonamesCity implements Serializable {
|
||||
|
||||
@JsonProperty("geonames_admin1")
|
||||
private GeonamesAdmin geonamesAdmin1;
|
||||
|
||||
@JsonProperty("geonames_admin2")
|
||||
private GeonamesAdmin geonamesAdmin2;
|
||||
|
||||
@JsonProperty("city")
|
||||
private String city;
|
||||
|
||||
@JsonProperty("id")
|
||||
private Integer id;
|
||||
|
||||
@JsonProperty("nuts_level1")
|
||||
private NameAndCode nutsLevel1;
|
||||
|
||||
@JsonProperty("nuts_level2")
|
||||
private NameAndCode nutsLevel2;
|
||||
|
||||
@JsonProperty("nuts_level3")
|
||||
private NameAndCode nutsLevel3;
|
||||
|
||||
@JsonProperty("license")
|
||||
private License license;
|
||||
|
||||
private final static long serialVersionUID = -8389480201526252955L;
|
||||
|
||||
public NameAndCode getNutsLevel2() {
|
||||
return nutsLevel2;
|
||||
}
|
||||
|
||||
public void setNutsLevel2(final NameAndCode nutsLevel2) {
|
||||
this.nutsLevel2 = nutsLevel2;
|
||||
}
|
||||
|
||||
public GeonamesAdmin getGeonamesAdmin2() {
|
||||
return geonamesAdmin2;
|
||||
}
|
||||
|
||||
public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) {
|
||||
this.geonamesAdmin2 = geonamesAdmin2;
|
||||
}
|
||||
|
||||
public GeonamesAdmin getGeonamesAdmin1() {
|
||||
return geonamesAdmin1;
|
||||
}
|
||||
|
||||
public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) {
|
||||
this.geonamesAdmin1 = geonamesAdmin1;
|
||||
}
|
||||
|
||||
public String getCity() {
|
||||
return city;
|
||||
}
|
||||
|
||||
public void setCity(final String city) {
|
||||
this.city = city;
|
||||
}
|
||||
|
||||
public Integer getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final Integer id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public NameAndCode getNutsLevel1() {
|
||||
return nutsLevel1;
|
||||
}
|
||||
|
||||
public void setNutsLevel1(final NameAndCode nutsLevel1) {
|
||||
this.nutsLevel1 = nutsLevel1;
|
||||
}
|
||||
|
||||
public NameAndCode getNutsLevel3() {
|
||||
return nutsLevel3;
|
||||
}
|
||||
|
||||
public void setNutsLevel3(final NameAndCode nutsLevel3) {
|
||||
this.nutsLevel3 = nutsLevel3;
|
||||
}
|
||||
|
||||
public License getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(final License license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Label implements Serializable {
|
||||
|
||||
@JsonProperty("iso639")
|
||||
private String iso639;
|
||||
|
||||
@JsonProperty("label")
|
||||
private String label;
|
||||
|
||||
private final static long serialVersionUID = -6576156103297850809L;
|
||||
|
||||
public String getIso639() {
|
||||
return iso639;
|
||||
}
|
||||
|
||||
public void setIso639(final String iso639) {
|
||||
this.iso639 = iso639;
|
||||
}
|
||||
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
public void setLabel(final String label) {
|
||||
this.label = label;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class License implements Serializable {
|
||||
|
||||
@JsonProperty("attribution")
|
||||
private String attribution;
|
||||
|
||||
@JsonProperty("license")
|
||||
private String license;
|
||||
|
||||
private final static long serialVersionUID = -194308261058176439L;
|
||||
|
||||
public String getAttribution() {
|
||||
return attribution;
|
||||
}
|
||||
|
||||
public void setAttribution(final String attribution) {
|
||||
this.attribution = attribution;
|
||||
}
|
||||
|
||||
public String getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(final String license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class NameAndCode implements Serializable {
|
||||
|
||||
@JsonProperty("name")
|
||||
private String name;
|
||||
|
||||
@JsonProperty("code")
|
||||
private String code;
|
||||
|
||||
private final static long serialVersionUID = 5459836979206140843L;
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(final String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Relationship implements Serializable {
|
||||
|
||||
@JsonProperty("type")
|
||||
private String type;
|
||||
|
||||
@JsonProperty("id")
|
||||
private String id;
|
||||
|
||||
@JsonProperty("label")
|
||||
private String label;
|
||||
|
||||
private final static long serialVersionUID = 7847399503395576960L;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
		this.type = type;
	}

	public String getId() {
		return id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public String getLabel() {
		return label;
	}

	public void setLabel(final String label) {
		this.label = label;
	}

}

@@ -0,0 +1,192 @@

package eu.dnetlib.dhp.actionmanager.ror.model;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.annotation.JsonProperty;

public class RorOrganization implements Serializable {

	@JsonProperty("ip_addresses")
	private List<String> ipAddresses = new ArrayList<>();

	@JsonProperty("aliases")
	private List<String> aliases = new ArrayList<>();

	@JsonProperty("acronyms")
	private List<String> acronyms = new ArrayList<>();

	@JsonProperty("links")
	private List<String> links = new ArrayList<>();

	@JsonProperty("country")
	private Country country;

	@JsonProperty("name")
	private String name;

	@JsonProperty("wikipedia_url")
	private String wikipediaUrl;

	@JsonProperty("addresses")
	private List<Address> addresses = new ArrayList<>();

	@JsonProperty("types")
	private List<String> types = new ArrayList<>();

	@JsonProperty("established")
	private Integer established;

	@JsonProperty("relationships")
	private List<Relationship> relationships = new ArrayList<>();

	@JsonProperty("email_address")
	private String emailAddress;

	@JsonProperty("external_ids")
	private Map<String, ExternalIdType> externalIds = new LinkedHashMap<>();

	@JsonProperty("id")
	private String id;

	@JsonProperty("labels")
	private List<Label> labels = new ArrayList<>();

	@JsonProperty("status")
	private String status;

	private final static long serialVersionUID = -2658312087616043225L;

	public List<String> getIpAddresses() {
		return ipAddresses;
	}

	public void setIpAddresses(final List<String> ipAddresses) {
		this.ipAddresses = ipAddresses;
	}

	public List<String> getAliases() {
		return aliases;
	}

	public void setAliases(final List<String> aliases) {
		this.aliases = aliases;
	}

	public List<String> getAcronyms() {
		return acronyms;
	}

	public void setAcronyms(final List<String> acronyms) {
		this.acronyms = acronyms;
	}

	public List<String> getLinks() {
		return links;
	}

	public void setLinks(final List<String> links) {
		this.links = links;
	}

	public Country getCountry() {
		return country;
	}

	public void setCountry(final Country country) {
		this.country = country;
	}

	public String getName() {
		return name;
	}

	public void setName(final String name) {
		this.name = name;
	}

	public String getWikipediaUrl() {
		return wikipediaUrl;
	}

	public void setWikipediaUrl(final String wikipediaUrl) {
		this.wikipediaUrl = wikipediaUrl;
	}

	public List<Address> getAddresses() {
		return addresses;
	}

	public void setAddresses(final List<Address> addresses) {
		this.addresses = addresses;
	}

	public List<String> getTypes() {
		return types;
	}

	public void setTypes(final List<String> types) {
		this.types = types;
	}

	public Integer getEstablished() {
		return established;
	}

	public void setEstablished(final Integer established) {
		this.established = established;
	}

	public List<Relationship> getRelationships() {
		return relationships;
	}

	public void setRelationships(final List<Relationship> relationships) {
		this.relationships = relationships;
	}

	public String getEmailAddress() {
		return emailAddress;
	}

	public void setEmailAddress(final String emailAddress) {
		this.emailAddress = emailAddress;
	}

	public Map<String, ExternalIdType> getExternalIds() {
		return externalIds;
	}

	public void setExternalIds(final Map<String, ExternalIdType> externalIds) {
		this.externalIds = externalIds;
	}

	public String getId() {
		return id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public List<Label> getLabels() {
		return labels;
	}

	public void setLabels(final List<Label> labels) {
		this.labels = labels;
	}

	public String getStatus() {
		return status;
	}

	public void setStatus(final String status) {
		this.status = status;
	}

}
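Since the class above is a plain Jackson binding, a ROR record can be materialized straight from the dump; this is a minimal sketch mirroring the mapper usage in GenerateRorActionSetJobTest further down in this diff (the inline JSON is a shortened sample from the ror_org.json test resource, and the wrapper class is hypothetical):

// Sketch only: deserialize one ROR record with the bindings above.
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;

public class RorOrganizationReadSketch {
	public static void main(final String[] args) throws Exception {
		final String json = "{\"id\": \"https://ror.org/019wvm592\", \"name\": \"Australian National University\", \"status\": \"active\"}";
		final RorOrganization org = new ObjectMapper().readValue(json, RorOrganization.class);
		System.out.println(org.getId() + " - " + org.getName() + " (" + org.getStatus() + ")");
	}
}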
@@ -18,7 +18,7 @@ public abstract class ReportingJob {
	 */
	public static final int INITIAL_DELAY = 2;

	private ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
	private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();

	protected final AggregatorReport report;

@@ -14,9 +14,9 @@ import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;

public class MDStoreActionNode {
	private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);

@@ -16,7 +16,6 @@ import org.apache.hadoop.io.compress.DeflateCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;

@@ -25,6 +24,7 @@ import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;

public class CollectorWorker extends ReportingJob {

@@ -13,10 +13,10 @@ import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;

/**
 * CollectorWorkerApplication is the main class responsible for starting the metadata collection process, storing the outcomes

@@ -30,7 +30,7 @@ public class CollectorWorkerApplication {

	private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);

	private FileSystem fileSystem;
	private final FileSystem fileSystem;

	public CollectorWorkerApplication(FileSystem fileSystem) {
		this.fileSystem = fileSystem;

@@ -28,8 +28,8 @@ import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import scala.Tuple2;

@@ -32,7 +32,7 @@ public class HttpConnector2 {

	private String responseType = null;

	private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
	private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";

	public HttpConnector2() {
		this(new HttpClientParams());

@@ -131,20 +131,12 @@ public class HttpConnector2 {
				}
				return attemptDownload(newUrl, retryNumber + 1, report);
			}
			if (is4xx(urlConn.getResponseCode())) {
				// CLIENT ERROR, DO NOT RETRY
				report
					.put(
						REPORT_PREFIX + urlConn.getResponseCode(),
						String
							.format(
								"%s error: %s", requestUrl, urlConn.getResponseMessage()));
				throw new CollectorException("4xx error: request will not be repeated. " + report);
			}
			if (is5xx(urlConn.getResponseCode())) {
				// SERVER SIDE ERRORS RETRY ONLY on 503
			if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
				switch (urlConn.getResponseCode()) {
					case HttpURLConnection.HTTP_NOT_FOUND:
					case HttpURLConnection.HTTP_BAD_GATEWAY:
					case HttpURLConnection.HTTP_UNAVAILABLE:
					case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
						if (retryAfter > 0) {
							log
								.warn(
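The hunk above is cut off mid-statement, but the visible structure merges the old 4xx/5xx branches into one switch that retries 404/502/503/504 and treats everything else as fatal. A self-contained sketch of that control flow, not the project's code (the seconds unit for retryAfter is an assumption):

// Sketch only: reconstructs the visible branch structure of the reworked handler.
public class RetryPolicySketch {
	static boolean shouldRetry(final int code, final int retryAfter) throws InterruptedException {
		switch (code) {
			case java.net.HttpURLConnection.HTTP_NOT_FOUND: // 404
			case java.net.HttpURLConnection.HTTP_BAD_GATEWAY: // 502
			case java.net.HttpURLConnection.HTTP_UNAVAILABLE: // 503
			case java.net.HttpURLConnection.HTTP_GATEWAY_TIMEOUT: // 504
				if (retryAfter > 0) {
					Thread.sleep(retryAfter * 1000L); // assumption: Retry-After is in seconds
				}
				return true; // retry these codes
			default:
				return false; // any other 4xx/5xx is treated as fatal
		}
	}

	public static void main(final String[] args) throws InterruptedException {
		System.out.println(shouldRetry(503, 0)); // true
		System.out.println(shouldRetry(400, 0)); // false
	}
}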
@@ -11,6 +11,8 @@ import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;

import eu.dnetlib.dhp.aggregation.common.AggregatorReport;

@@ -21,6 +21,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;

public class OaiCollectorPlugin implements CollectorPlugin {

	public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
	public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";

	private static final String FORMAT_PARAM = "format";
	private static final String OAI_SET_PARAM = "set";
	private static final Object OAI_FROM_DATE_PARAM = "fromDate";

@@ -62,11 +65,11 @@ public class OaiCollectorPlugin implements CollectorPlugin {
			throw new CollectorException("Param 'mdFormat' is null or empty");
		}

		if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
		if (fromDate != null && !fromDate.matches(DATE_REGEX) && !fromDate.matches(UTC_DATETIME_REGEX)) {
			throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
		}

		if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
		if (untilDate != null && !untilDate.matches(DATE_REGEX) && !untilDate.matches(UTC_DATETIME_REGEX)) {
			throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
		}
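The two constants introduced above widen validation to accept either a plain date or a UTC datetime. A small self-contained check using only the regexes shown in the hunk (the wrapper class and helper are illustrative, not part of OaiCollectorPlugin):

// Sketch only: exercises the two regexes introduced above.
public class DateParamCheck {
	static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
	static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";

	static boolean isValid(final String d) {
		return d == null || d.matches(DATE_REGEX) || d.matches(UTC_DATETIME_REGEX);
	}

	public static void main(final String[] args) {
		System.out.println(isValid("2021-04-06")); // true
		System.out.println(isValid("2021-04-06T10:15:30Z")); // true
		System.out.println(isValid("06/04/2021")); // false: rejected with CollectorException in the plugin
	}
}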
@@ -42,7 +42,7 @@ public class OaiIterator implements Iterator<String> {
	private String token;
	private boolean started;
	private final HttpConnector2 httpConnector;
	private AggregatorReport report;
	private final AggregatorReport report;

	public OaiIterator(
		final String baseUrl,

@@ -107,10 +107,12 @@ public class OaiIterator implements Iterator<String> {
		if (set != null && !set.isEmpty()) {
			url += "&set=" + URLEncoder.encode(set, "UTF-8");
		}
		if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
		if (fromDate != null && (fromDate.matches(OaiCollectorPlugin.DATE_REGEX)
			|| fromDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
			url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
		}
		if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
		if (untilDate != null && (untilDate.matches(OaiCollectorPlugin.DATE_REGEX)
			|| untilDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
			url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
		}
		log.info("Start harvesting using url: " + url);

@@ -26,7 +26,7 @@ public class RestCollectorPlugin implements CollectorPlugin {

	public static final String RESULT_SIZE_VALUE_DEFAULT = "100";

	private HttpClientParams clientParams;
	private final HttpClientParams clientParams;

	public RestCollectorPlugin(HttpClientParams clientParams) {
		this.clientParams = clientParams;

@@ -48,18 +48,18 @@ public class RestIterator implements Iterator<String> {
	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
	public static final String UTF_8 = "UTF-8";

	private HttpClientParams clientParams;
	private final HttpClientParams clientParams;

	private final String BASIC = "basic";

	private JsonUtils jsonUtils;
	private final JsonUtils jsonUtils;

	private String baseUrl;
	private String resumptionType;
	private String resumptionParam;
	private String resultFormatValue;
	private final String baseUrl;
	private final String resumptionType;
	private final String resumptionParam;
	private final String resultFormatValue;
	private String queryParams;
	private int resultSizeValue;
	private final int resultSizeValue;
	private int resumptionInt = 0; // integer resumption token (first record to harvest)
	private int resultTotal = -1;
	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest

@@ -71,11 +71,11 @@ public class RestIterator implements Iterator<String> {
	private XPathExpression xprResultTotalPath;
	private XPathExpression xprResumptionPath;
	private XPathExpression xprEntity;
	private String queryFormat;
	private String querySize;
	private String authMethod;
	private String authToken;
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
	private final String queryFormat;
	private final String querySize;
	private final String authMethod;
	private final String authToken;
	private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
	private int discoverResultSize = 0;
	private int pagination = 1;
	/*

@@ -83,7 +83,7 @@ public class RestIterator implements Iterator<String> {
	 * json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
	 * json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
	 */
	private String resultOutputFormat;
	private final String resultOutputFormat;

	/** RestIterator class
	 * compatible with version 1.3.33

@@ -229,7 +229,7 @@ public class RestIterator implements Iterator<String> {

			resultStream = theHttpInputStream;
			if ("json".equals(resultOutputFormat)) {
				resultJson = IOUtils.toString(resultStream, UTF_8);
				resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
				resultXml = jsonUtils.convertToXML(resultJson);
				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
			}

@@ -22,14 +22,13 @@ import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@@ -37,7 +36,7 @@ public class TransformSparkJobNode {

	private static final Logger log = LoggerFactory.getLogger(TransformSparkJobNode.class);

	private static int RECORDS_PER_TASK = 200;
	private static final int RECORDS_PER_TASK = 200;

	public static void main(String[] args) throws Exception {

@@ -10,87 +10,11 @@ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import net.sf.saxon.s9api.*;

public class DateCleaner implements ExtensionFunction, Serializable {

	private final static List<Pattern> dateRegex = Arrays
		.asList(
			// Y-M-D
			Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
			// M-D-Y
			Pattern
				.compile(
					"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
					Pattern.MULTILINE),
			// D-M-Y
			Pattern
				.compile(
					"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
					Pattern.MULTILINE),
			// Y
			Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));

	private final static Pattern incompleteDateRegex = Pattern
		.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);

	private final static List<DateTimeFormatter> dformats = Arrays
		.asList(
			DateTimeFormatter
				.ofPattern(
					"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
					Locale.ENGLISH),
			DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));

	public String clean(final String inputDate) {

		Optional<String> cleanedDate = dateRegex
			.stream()
			.map(
				p -> {
					final Matcher matcher = p.matcher(inputDate);
					if (matcher.find())
						return matcher.group(0);
					else
						return null;
				})
			.filter(Objects::nonNull)
			.map(m -> {
				Optional<String> cleanDate = dformats
					.stream()
					.map(f -> {
						try {
							LocalDate parsedDate = LocalDate.parse(m, f);
							if (parsedDate != null)
								return parsedDate.toString();
							else
								return null;
						} catch (Throwable e) {
							return null;
						}
					}

					)
					.filter(Objects::nonNull)
					.findAny();

				return cleanDate.orElse(null);
			})
			.filter(Objects::nonNull)
			.findAny();

		if (cleanedDate.isPresent())
			return cleanedDate.get();

		final Matcher matcher = incompleteDateRegex.matcher(inputDate);
		if (matcher.find()) {
			final Integer year = Integer.parseInt(matcher.group(1));
			final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4));
			return String.format("%d-%02d-01", year, month);
		}
		return null;
	}

	@Override
	public QName getName() {
		return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");

@@ -117,4 +41,9 @@ public class DateCleaner implements ExtensionFunction, Serializable {
		final String currentValue = xdmValues[0].itemAt(0).getStringValue();
		return new XdmAtomicValue(clean(currentValue));
	}

	// for backward compatibility with the existing unit tests
	public String clean(String date) {
		return GraphCleaningFunctions.cleanDate(date);
	}
}
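The hand-rolled regex/formatter pipeline above is dropped in favour of delegating to GraphCleaningFunctions.cleanDate. The delegation is exercised by TransformationJobTest later in this diff; a minimal standalone check of the same expectations (the wrapper class is hypothetical, and the expected outputs come from that test, not from new behaviour):

// Sketch mirroring the assertions in TransformationJobTest further below.
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;

public class DateCleanerCheck {
	public static void main(final String[] args) {
		final DateCleaner dc = new DateCleaner();
		System.out.println(dc.clean("20/09/1982")); // 1982-09-20
		System.out.println(dc.clean("2002-9")); // 2002-09-01 (incomplete dates default to the first day)
		System.out.println(dc.clean("2021")); // 2021-01-01
	}
}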
@@ -26,7 +26,7 @@ public class PersonCleaner implements ExtensionFunction, Serializable {
	private List<String> surname = Lists.newArrayList();
	private List<String> fullname = Lists.newArrayList();

	private static Set<String> particles = null;
	private static final Set<String> particles = null;

	public PersonCleaner() {

@@ -20,6 +20,10 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M

	public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";

	private final static String DATASOURCE_ID_PARAM = "varDataSourceId";

	private final static String DATASOURCE_NAME_PARAM = "varOfficialName";

	private final AggregationCounter aggregationCounter;

	private final String transformationRule;

@@ -48,11 +52,16 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
		aggregationCounter.getTotalItems().add(1);
		try {
			Processor processor = new Processor(false);

			processor.registerExtensionFunction(cleanFunction);
			processor.registerExtensionFunction(new DateCleaner());
			processor.registerExtensionFunction(new PersonCleaner());

			final XsltCompiler comp = processor.newXsltCompiler();
			QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
			comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));
			QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM);
			comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName()));
			XsltExecutable xslt = comp
				.compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
			XdmNode source = processor

@@ -18,6 +18,12 @@
		"paramDescription": "avoid to download new items but apply the previous update",
		"paramRequired": false
	},
	{
		"paramName": "bs",
		"paramLongName": "blocksize",
		"paramDescription": "define the requests block size",
		"paramRequired": false
	},
	{
		"paramName": "n",
		"paramLongName": "namenode",

@@ -8,6 +8,12 @@
		<name>isLookupUrl</name>
		<description>The IS lookUp service endpoint</description>
	</property>
	<property>
		<name>blocksize</name>
		<value>100</value>
		<description>The request block size</description>
	</property>

</parameters>

<start to="ImportDatacite"/>

@@ -37,6 +43,7 @@
	<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
	<arg>--namenode</arg><arg>${nameNode}</arg>
	<arg>--master</arg><arg>yarn-cluster</arg>
	<arg>--blocksize</arg><arg>${blocksize}</arg>
</spark>
<ok to="TransformJob"/>
<error to="Kill"/>

@@ -1,4 +1,4 @@
<workflow-app name="H2020Programme" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>projectFileURL</name>

@@ -18,6 +18,10 @@
		<name>outputPath</name>
		<description>path where to store the action set</description>
	</property>
	<property>
		<name>sheetName</name>
		<description>the name of the sheet to read</description>
	</property>
</parameters>

<start to="deleteoutputpath"/>

@@ -31,10 +35,23 @@
		<delete path='${workingDir}'/>
		<mkdir path='${workingDir}'/>
	</fs>
	<ok to="get_project_file"/>
	<ok to="fork_get_info"/>
	<error to="Kill"/>
</action>

<fork name="fork_get_info">
	<path start="fork_get_projects"/>
	<path start="get_programme_file"/>
	<path start="get_topic_file"/>
</fork>

<fork name="fork_get_projects">
	<path start="get_project_file"/>
	<path start="read_projects"/>
</fork>

<action name="get_project_file">
	<java>
		<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>

@@ -43,7 +60,7 @@
		<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
		<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
	</java>
	<ok to="get_programme_file"/>
	<ok to="wait_projects"/>
	<error to="Kill"/>
</action>

@@ -55,7 +72,7 @@
		<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
		<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
	</java>
	<ok to="get_topic_file"/>
	<ok to="prepare_programme"/>
	<error to="Kill"/>
</action>

@@ -65,9 +82,10 @@
		<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
		<arg>--fileURL</arg><arg>${topicFileURL}</arg>
		<arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
		<arg>--sheetName</arg><arg>${sheetName}</arg>
		<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
	</java>
	<ok to="read_projects"/>
	<ok to="wait"/>
	<error to="Kill"/>
</action>

@@ -80,7 +98,7 @@
		<arg>--postgresUser</arg><arg>${postgresUser}</arg>
		<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
	</java>
	<ok to="prepare_programme"/>
	<ok to="wait_projects"/>
	<error to="Kill"/>
</action>

@@ -104,10 +122,15 @@
		<arg>--programmePath</arg><arg>${workingDir}/programme</arg>
		<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
	</spark>
	<ok to="prepare_project"/>
	<ok to="wait"/>
	<error to="Kill"/>
</action>

<join name="wait" to="create_updates"/>

<join name="wait_projects" to="prepare_project"/>

<action name="prepare_project">
	<spark xmlns="uri:oozie:spark-action:0.2">
		<master>yarn</master>

@@ -129,7 +152,7 @@
		<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
		<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
	</spark>
	<ok to="create_updates"/>
	<ok to="wait"/>
	<error to="Kill"/>
</action>

@@ -23,6 +23,11 @@
	"paramLongName" : "classForName",
	"paramDescription" : "the name of the class to deserialize the csv to",
	"paramRequired" : true
}, {
	"paramName": "sn",
	"paramLongName" : "sheetName",
	"paramDescription" : "the name of the sheet in case the file is excel",
	"paramRequired" : false
}

@@ -0,0 +1,14 @@
[
	{
		"paramName": "i",
		"paramLongName": "inputPath",
		"paramDescription": "the path of the input json",
		"paramRequired": true
	},
	{
		"paramName": "o",
		"paramLongName": "outputPath",
		"paramDescription": "the path of the new ActionSet",
		"paramRequired": true
	}
]
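Parameter files like the one above are read at job start. A hedged sketch of how a main method typically consumes them in this codebase (ArgumentApplicationParser is imported by several classes in this diff; the wrapper class and the resource path are illustrative assumptions, not the actual job):

// Illustrative only: resource name and class are hypothetical.
import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class ActionSetParamsSketch {
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils.toString(ActionSetParamsSketch.class
				.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"))); // assumed path
		parser.parseArgument(args);
		final String inputPath = parser.get("inputPath"); // -i / --inputPath
		final String outputPath = parser.get("outputPath"); // -o / --outputPath
		System.out.println(inputPath + " -> " + outputPath);
	}
}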
@@ -0,0 +1,58 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
	<property>
		<name>hive_metastore_uris</name>
		<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
	</property>
	<property>
		<name>spark2YarnHistoryServerAddress</name>
		<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
	</property>
	<property>
		<name>spark2ExtraListeners</name>
		<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
	</property>
	<property>
		<name>spark2SqlQueryExecutionListeners</name>
		<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
	</property>
	<property>
		<name>oozie.launcher.mapreduce.user.classpath.first</name>
		<value>true</value>
	</property>
	<property>
		<name>sparkExecutorNumber</name>
		<value>4</value>
	</property>
	<property>
		<name>spark2EventLogDir</name>
		<value>/user/spark/spark2ApplicationHistory</value>
	</property>
	<property>
		<name>sparkDriverMemory</name>
		<value>15G</value>
	</property>
	<property>
		<name>sparkExecutorMemory</name>
		<value>6G</value>
	</property>
	<property>
		<name>sparkExecutorCores</name>
		<value>1</value>
	</property>
</configuration>

@@ -0,0 +1,55 @@
<workflow-app name="Update_ROR_action_set" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>rorJsonInputPath</name>
			<description>the path of the json</description>
		</property>
		<property>
			<name>rorActionSetPath</name>
			<description>path where to store the action set</description>
		</property>
	</parameters>

	<start to="deleteoutputpath"/>

	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="deleteoutputpath">
		<fs>
			<delete path='${rorActionSetPath}'/>
			<mkdir path='${rorActionSetPath}'/>
			<delete path='${workingDir}'/>
			<mkdir path='${workingDir}'/>
		</fs>
		<ok to="processRorFile"/>
		<error to="Kill"/>
	</action>

	<action name="processRorFile">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>ProcessRorFile</name>
			<class>eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob</class>
			<jar>dhp-aggregation-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--inputPath</arg><arg>${rorJsonInputPath}</arg>
			<arg>--outputPath</arg><arg>${rorActionSetPath}</arg>
		</spark>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>
</workflow-app>
@@ -21,7 +21,7 @@ public class EXCELParserTest {

	private static Path workingDir;
	private HttpConnector2 httpConnector = new HttpConnector2();
	private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
	private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx";

	@BeforeAll
	public static void beforeAll() throws IOException {

@@ -36,9 +36,11 @@ public class EXCELParserTest {
		EXCELParser excelParser = new EXCELParser();

		List<Object> pl = excelParser
			.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic");
			.parse(
				httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic",
				"Topics");

		Assertions.assertEquals(3837, pl.size());
		Assertions.assertEquals(3878, pl.size());

	}
}

@@ -0,0 +1,46 @@

package eu.dnetlib.dhp.actionmanager.ror;

import java.io.FileInputStream;

import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.schema.oaf.Organization;

@Disabled
class GenerateRorActionSetJobTest {

	private static final ObjectMapper mapper = new ObjectMapper();

	private static final String local_file_path = "/Users/michele/Downloads/ror-data-2021-04-06.json";

	@BeforeEach
	void setUp() throws Exception {
	}

	@Test
	void testConvertRorOrg() throws Exception {
		final RorOrganization r = mapper
			.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
		final Organization org = GenerateRorActionSetJob.convertRorOrg(r);

		System.out.println(mapper.writeValueAsString(org));
	}

	@Test
	void testConvertAllRorOrg() throws Exception {
		final RorOrganization[] arr = mapper
			.readValue(IOUtils.toString(new FileInputStream(local_file_path)), RorOrganization[].class);

		for (final RorOrganization r : arr) {
			GenerateRorActionSetJob.convertRorOrg(r);
		}
	}

}
@@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;

@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)

@@ -36,8 +36,8 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.dhp.transformation.TransformSparkJobNode;

@@ -25,22 +25,22 @@ public class RestCollectorPluginTest {

	private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);

	private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
	private String resumptionType = "count";
	private String resumptionParam = "from";
	private String entityXpath = "//hits/hits";
	private String resumptionXpath = "//hits";
	private String resultTotalXpath = "//hits/total";
	private String resultFormatParam = "format";
	private String resultFormatValue = "json";
	private String resultSizeParam = "size";
	private String resultSizeValue = "10";
	private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
	private final String resumptionType = "count";
	private final String resumptionParam = "from";
	private final String entityXpath = "//hits/hits";
	private final String resumptionXpath = "//hits";
	private final String resultTotalXpath = "//hits/total";
	private final String resultFormatParam = "format";
	private final String resultFormatValue = "json";
	private final String resultSizeParam = "size";
	private final String resultSizeValue = "10";
	// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
	private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
	private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
	// private String query = "=(sources:engrXiv AND type:preprint)";

	private String protocolDescriptor = "rest_json2xml";
	private ApiDescriptor api = new ApiDescriptor();
	private final String protocolDescriptor = "rest_json2xml";
	private final ApiDescriptor api = new ApiDescriptor();
	private RestCollectorPlugin rcp;

	@BeforeEach

@@ -20,20 +20,20 @@ public class RestIteratorTest {

	private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class);

	private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
	private String resumptionType = "count";
	private String resumptionParam = "from";
	private String resumptionXpath = "";
	private String resultTotalXpath = "//hits/total";
	private String entityXpath = "//hits/hits";
	private String resultFormatParam = "format";
	private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
	private String resultSizeParam = "size";
	private String resultSizeValue = "10";
	private String authMethod = "";
	private String authToken = "";
	private String resultOffsetParam = "cursor";
	private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
	private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
	private final String resumptionType = "count";
	private final String resumptionParam = "from";
	private final String resumptionXpath = "";
	private final String resultTotalXpath = "//hits/total";
	private final String entityXpath = "//hits/hits";
	private final String resultFormatParam = "format";
	private final String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
	private final String resultSizeParam = "size";
	private final String resultSizeValue = "10";
	private final String authMethod = "";
	private final String authToken = "";
	private final String resultOffsetParam = "cursor";
	private final String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";

	@Disabled
	@Test

@@ -27,6 +27,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;

@@ -50,11 +51,11 @@ public class TransformationJobTest extends AbstractVocabularyTest {
	@DisplayName("Test Date cleaner")
	public void testDateCleaner() throws Exception {
		DateCleaner dc = new DateCleaner();
		assertEquals(dc.clean("20/09/1982"), "1982-09-20");
		assertEquals(dc.clean("20-09-2002"), "2002-09-20");
		assertEquals(dc.clean("2002-09-20"), "2002-09-20");
		assertEquals(dc.clean("2002-9"), "2002-09-01");
		assertEquals(dc.clean("2021"), "2021-01-01");
		assertEquals("1982-09-20", dc.clean("20/09/1982"));
		assertEquals("2002-09-20", dc.clean("20-09-2002"));
		assertEquals("2002-09-20", dc.clean("2002-09-20"));
		assertEquals("2002-09-01", dc.clean("2002-9"));
		assertEquals("2021-01-01", dc.clean("2021"));
	}

	@Test

@@ -63,6 +64,8 @@ public class TransformationJobTest extends AbstractVocabularyTest {

		// We Set the input Record getting the XML from the classpath
		final MetadataRecord mr = new MetadataRecord();

		mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml")));
		// We Load the XSLT transformation Rule from the classpath
		XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");

@@ -80,6 +83,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {

		// We Set the input Record getting the XML from the classpath
		final MetadataRecord mr = new MetadataRecord();
		mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
		// We Load the XSLT transformation Rule from the classpath
		XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");

@@ -101,6 +105,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {

		// We Set the input Record getting the XML from the classpath
		final MetadataRecord mr = new MetadataRecord();
		mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
		// We Load the XSLT transformation Rule from the classpath
		XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);

@@ -121,6 +126,7 @@ public class TransformationJobTest extends AbstractVocabularyTest {

		// We Set the input Record getting the XML from the classpath
		final MetadataRecord mr = new MetadataRecord();
		mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml")));
		// We Load the XSLT transformation Rule from the classpath
		XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,123 @@
{
  "ip_addresses": [],
  "aliases": [],
  "acronyms": [
    "ANU"
  ],
  "links": [
    "http://www.anu.edu.au/"
  ],
  "country": {
    "country_code": "AU",
    "country_name": "Australia"
  },
  "name": "Australian National University",
  "wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
  "addresses": [
    {
      "lat": -35.2778,
      "state_code": "AU-ACT",
      "country_geonames_id": 2077456,
      "lng": 149.1205,
      "state": "Australian Capital Territory",
      "city": "Canberra",
      "geonames_city": {
        "nuts_level2": {
          "name": null,
          "code": null
        },
        "geonames_admin2": {
          "ascii_name": null,
          "id": null,
          "name": null,
          "code": null
        },
        "geonames_admin1": {
          "ascii_name": "ACT",
          "id": 2177478,
          "name": "ACT",
          "code": "AU.01"
        },
        "city": "Canberra",
        "id": 2172517,
        "nuts_level1": {
          "name": null,
          "code": null
        },
        "nuts_level3": {
          "name": null,
          "code": null
        },
        "license": {
          "attribution": "Data from geonames.org under a CC-BY 3.0 license",
          "license": "http://creativecommons.org/licenses/by/3.0/"
        }
      },
      "postcode": null,
      "primary": false,
      "line": null
    }
  ],
  "types": [
    "Education"
  ],
  "established": 1946,
  "relationships": [
    {
      "type": "Related",
      "id": "https://ror.org/041c7s516",
      "label": "Calvary Hospital"
    },
    {
      "type": "Related",
      "id": "https://ror.org/04h7nbn38",
      "label": "Canberra Hospital"
    },
    {
      "type": "Related",
      "id": "https://ror.org/030jpqj15",
      "label": "Goulburn Base Hospital"
    },
    {
      "type": "Child",
      "id": "https://ror.org/006a4jj40",
      "label": "Mount Stromlo Observatory"
    }
  ],
  "email_address": null,
  "external_ids": {
    "Wikidata": {
      "all": [
        "Q127990"
      ],
      "preferred": null
    },
    "OrgRef": {
      "all": [
        "285106"
      ],
      "preferred": null
    },
    "ISNI": {
      "all": [
        "0000 0001 2180 7477"
      ],
      "preferred": null
    },
    "FundRef": {
      "all": [
        "501100000995",
        "501100001151",
        "100009020"
      ],
      "preferred": "501100000995"
    },
    "GRID": {
      "all": "grid.1001.0",
      "preferred": "grid.1001.0"
    }
  },
  "id": "https://ror.org/019wvm592",
  "labels": [],
  "status": "active"
}
|
@ -0,0 +1,143 @@
|
|||
// from PROD 2021-07-06 , tf script of HAL with around 3mill. records
|
||||
declare_script "dc_cleaning_OpenAIREplus_compliant_hal";
|
||||
declare_ns oaf = "http://namespace.openaire.eu/oaf";
|
||||
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
|
||||
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
|
||||
declare_ns dc = "http://purl.org/dc/elements/1.1/";
|
||||
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
|
||||
declare_ns oai = "http://www.openarchives.org/OAI/2.0/";
|
||||
declare_ns xs = "http://www.w3.org/2001/XMLSchema";
|
||||
$var0 = "''";
|
||||
$varFP7 = "'corda_______::'";
|
||||
$varH2020 = "'corda__h2020::'";
|
||||
$varDummy = "''";
|
||||
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
|
||||
static $varRepoid = xpath:"//dri:repositoryId";
|
||||
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
|
||||
dri:objIdentifier = xpath:"//dri:objIdentifier";
|
||||
dri:repositoryId = $varRepoid;
|
||||
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
|
||||
//
|
||||
// communities - deactivated until received green light from DARIAH to mark community on prod also
|
||||
// $varCommunity = xpath:"//*[local-name()='setSpec'][starts-with(., 'collection:DARIAH')]/'dariah'";
|
||||
// concept should not appear with empty attribute id, i.e when there is no community - ugly, but seems to work (oaf:datasourceprefix = just any field available in all records)
|
||||
// oaf:concept = set(xpath:"//oaf:datasourceprefix[string-length($varCommunity) gt 0]/''", @id = $varCommunity;);
|
||||
//
|
||||
// apply xpath:"//dc:contributor[starts-with(., 'European Project')]" if xpath:"string-length(replace(., '.*(\d{6,6}).*', '$1')) = 6" oaf:projectid = xpath:"concat($var1, replace(., '.*(\d{6,6}).*', '$1'))"; else $varDummy = "''";
|
||||
apply xpath:"//dc:creator" if xpath:"string-length(.) > 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
if xpath:"//dc:title[string-length(.)> 0]" $varDummy = "''"; else dc:coverage = skipRecord();
|
||||
dc:title = xpath:"//dc:title[string-length(.) > 0]/normalize-space(.)";
|
||||
apply xpath:"//dc:subject" if xpath:"string-length(.) > 0" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
apply xpath:"//dc:publisher" if xpath:"string-length(.) > 0" dc:publisher = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
apply xpath:"//dc:source" if xpath:"string-length(.) > 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
dc:contributor = xpath:"//dc:contributor";
|
||||
// dc:description = xpath:"//dc:description/normalize-space(.)";
|
||||
//dc:description = xpath:"string-join(//dc:description/normalize-space(.), concat('; ',codepoints-to-string(10)))";
|
||||
dc:description = xpath:"string-join(//dc:description/normalize-space(.), '; ')";
|
||||
dc:format = xpath:"//dc:format";
|
||||
$varHttpTest = "''";
|
||||
oaf:fulltext = xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))]";
|
||||
//if xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))] or //dc:relation[starts-with(lower-case(normalize-space(.)), 'info:eu-repo/grantagreement')] or //dc:rights[starts-with(lower-case(normalize-space(.)), 'open') or contains(lower-case(normalize-space(.)), 'openaccess')] or //dc:accessRights[contains(lower-case(normalize-space(.)), 'openaccess')]" $var0 = "''"; else dc:coverage = skipRecord();
|
||||
if xpath:"//dc:identifier[starts-with(., 'http')]" $var0 = "''"; else dc:coverage = skipRecord();
|
||||
apply xpath:"//dc:identifier" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
|
||||
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
|
||||
static dr:dateOfTransformation = xpath:"current-dateTime()";
|
||||
dc:type = xpath:"//dc:type";
|
||||
dc:format = xpath:"//dc:format";
|
||||
dc:date = xpath:"//dc:date";
|
||||
dc:language = Convert(xpath:"//dc:language", Languages);
|
||||
$varDateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
|
||||
if xpath:"starts-with($varDateAccepted, '0')" oaf:dateAccepted = $varDummy; else oaf:dateAccepted = $varDateAccepted;
|
||||
$varEmbargoEnd = xpath:"//dc:date[matches(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', 'i')][contains(lower-case(.), 'info:eu-repo')]/replace(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', '$3', 'i')";
|
||||
oaf:embargoenddate = $varEmbargoEnd;
|
||||
// FP7
|
||||
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2006][contains(lower-case(.), 'info:eu-repo')]/concat($varFP7, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
|
||||
// H2020
|
||||
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
|
||||
// H2020 workaround for HAL
|
||||
oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', 'i')][//dc:contributor[contains(lower-case(.), 'h2020')]][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', '$3', 'i')))";
|
||||
dc:relation = xpath:"//dc:relation";
|
||||
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
|
||||
//
|
||||
oaf:collectedDatasourceid = xpath:"$varDatasourceid";
|
||||
//
|
||||
//if xpath:"//dc:type[1]/lower-case(.) = 'text'" dr:CobjCategory = Convert(xpath:"reverse(//dc:type) | //oai:setSpec", TextTypologies); else dr:CobjCategory = Convert(xpath:"//dc:type | //oai:setSpec", TextTypologies);
|
||||
$varCobjCategoryReverse = Convert(xpath:"insert-before(reverse(//dc:type) , 0, reverse(//oai:setSpec))", TextTypologies);
|
||||
$varSuperTypeReverse = Convert(xpath:"normalize-space($varCobjCategoryReverse)", SuperTypes);
|
||||
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other') or //oaf:datasourceprefix/lower-case(.) = 'openedition_']/$varCobjCategoryReverse", @type = $varSuperTypeReverse;);
|
||||
$varCobjCategoryStraight = Convert(xpath:"insert-before(//dc:type , 100, //oai:setSpec)", TextTypologies);
|
||||
$varSuperTypeStraight = Convert(xpath:"normalize-space($varCobjCategoryStraight)", SuperTypes);
|
||||
dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[not(//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other'))]/$varCobjCategoryStraight", @type = $varSuperTypeStraight;);
|
||||
//
|
||||
// review level
|
||||
// oaf:refereed = Convert(xpath:"//dc:description", ReviewLevels);
|
||||
$varRefereedConvt = Convert(xpath:"(//dc:type, //oai:setSpec, //dc:description)", ReviewLevels);
|
||||
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*(this\s*book|it)\s*constitutes\s*the\s*(thoroughly\s*)?refereed') or matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*')]/'0001', //dc:description[matches(., '^version\s*(préliminaire.*|0$)')]/'0002')";
|
||||
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])pre[\.\-_/\s\(\)]?prints?([\.\-_/\s\(\)].*)?$')][count(//dc:identifier) = 1]/'0002', //*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])refereed([\.\-_/\s\(\)\d].*)?$')]/'0001', //*[string(node-name(.)) = 'dc:identifier' and contains(lower-case(.), '-peer-reviewed-article-')]/'0001')";
|
||||
$varRefereed = xpath:"($varRefereedConvt, $varRefereedIdntf, $varRefereedDesct)";
|
||||
if xpath:"count(index-of($varRefereed, '0001')) >0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
|
||||
if xpath:"count(index-of($varRefereed, '0002')) >0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
|
||||
//
|
||||
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') and (xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) gt current-date())" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
|
||||
// apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') " oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
|
||||
//2021-06-01 ; acz ; next line to avoid to be OPEN as default, set to UNKNOWN , 2021-07-05 acz
|
||||
//if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics') and not(xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) lt current-date())]" $var0 = "''"; else oaf:accessrights = "UNKNOWN";
|
||||
oaf:license = xpath:"//dc:rights[starts-with(., 'http') or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')]";
|
||||
//
|
||||
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
|
||||
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
|
||||
//
|
||||
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)');
|
||||
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"info:\") or starts-with(., \"urn:\") or starts-with(., \"doi:\") or starts-with(., \"DOI:\") or starts-with(., \"Doi:\") or starts-with(., \"doi \") or starts-with(., \"DOI \") or starts-with(., \"Doi \") or starts-with(., \"10.\") or ((starts-with(., \"http\")) and contains(., \"doi.org/10.\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/10.\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/http\")) and contains(., \"doi.org/10.\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)');
|
||||
|
||||
$varIdHdl = identifierExtract('["//dc:identifier[starts-with(., \"HDL:\") and not(starts-with(., \"HDL: http\"))][not(contains(., \"123456789\"))]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/hdl/\") or (starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/url/\") and contains(., \"://hdl.handle.net/\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(info:hdl:|://hdl.handle.net/|info:eu-repo/semantics/altIdentifier/hdl/))(\d.*)');
|
||||
|
||||
|
||||
$varIdIsbn = xpath:"(//dc:identifier, //dc:source)[starts-with(lower-case(.), 'isbn') or starts-with(., '978') or starts-with(., '979')][(matches(., '(isbn[:\s]*)?97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(concat('97', substring-after(., '97'))) = 17) or matches(., '(isbn[:\s]*)?97[89]\d{10}$', 'i')]/replace(., 'isbn[:\s]*', '', 'i'), //dc:relation[starts-with(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')][(matches(., 'info:eu-repo/semantics/altIdentifier/isbn/97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(.) = 59) or matches(., 'info:eu-repo/semantics/altidentifier/isbn/97[89]\d{10}$', 'i')]/substring-after(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')";
|
||||
|
||||
$varIdBibc = identifierExtract('["//dc:identifier[starts-with(., \"BibCode:\") or starts-with(., \"BIBCODE:\") or (starts-with(., \"http:\") and contains(., \"bibcode=\"))]"]' , xpath:"./*[local-name()='record']" , '(^(BibCode:|BIBCODE:|http).*$)');
|
||||
|
||||
$varIdPtnt = identifierExtract('["//dc:identifier[starts-with(., \"Patent N°:\")]"]' , xpath:"./*[local-name()='record']" , '(^Patent N°:.*$)');
|
||||
|
||||
$varPmId = identifierExtract('["//dc:identifier[starts-with(normalize-space(.), \"PUBMED:\")]"]' , xpath:"./*[local-name()='record']" , '(?!PUBMED: )(\d+)');
|
||||
|
||||
$varIdPmc = identifierExtract('["//dc:identifier[starts-with(., \"PUBMEDCENTRAL:\") or (starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/PMC\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/http\")) and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(PMC\d+)');
|
||||
|
||||
//$varIdHal = identifierExtract('["//dc:identifier[starts-with(., \"ads-\") or starts-with(., \"anses-\") or starts-with(., \"artxibo-\") or starts-with(., \"bioemco-\") or starts-with(., \"cea-\") or starts-with(., \"cel-\") or starts-with(., \"cirad-\") or starts-with(., \"edutice-\") or starts-with(., \"emse-\") or starts-with(., \"EMSE-\") or starts-with(., \"ensl-\") or starts-with(., \"hal-\") or starts-with(., \"HAL-\") or starts-with(., \"halsde-\") or starts-with(., \"halshs-\") or starts-with(., \"hprints-\") or starts-with(., \"in2p3-\") or starts-with(., \"ineris-\") or starts-with(., \"inria-\") or starts-with(., \"Inria-\") or starts-with(., \"inserm-\") or starts-with(., \"insu-\") or starts-with(., \"INSU-\") or starts-with(., \"ird-\") or starts-with(., \"irsn-\") or starts-with(., \"jpa-\") or starts-with(., \"lirmm-\") or starts-with(., \"medihal-\") or starts-with(., \"meteo-\") or starts-with(., \"mnhn-\") or starts-with(., \"obspm-\") or starts-with(., \"pastel-\") or starts-with(., \"pasteur-\") or starts-with(., \"Pasteur-\") or starts-with(., \"peer-\") or starts-with(., \"ssa-\") or starts-with(., \"tel-\") or starts-with(., \"ujm-\") or starts-with(., \"ijn_\") or starts-with(., \"sic_\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\") or contains(., \"://medihal.archives-ouvertes.fr/hal\")))]", "//dc:relation[((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\")) and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '((ads|anses|artxibo|bioemco|cea|cel|cirad|edutice|emse|EMSE|ensl|hal|HAL|halsde|halshs|hprints|in2p3|ineris|inria|Inria|inserm|insu|INSU|ird|irsn|jpa|lirmm|medihal|meteo|mnhn|obspm|pastel|pasteur|Pasteur|peer|ssa|tel|ujm)-|(ijn|sic)_).*');
|
||||
$varIdHal = identifierExtract('["//*[local-name() = \"recordIdentifier\"]"]' , xpath:"./*[local-name()='record']" , '(oai:HAL:.*)');
|
||||
|
||||
$varIdArxv = identifierExtract('["//dc:identifier[((starts-with(., \"http\") or starts-with(., \"ArXiv: http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\"))) or starts-with(., \"arXiv:\") or starts-with(., \"ARXIV:\")]", "//dc:relation[(starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/\") and not(contains(., \"/arxiv/http\"))) or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\")))]"]' , xpath:"./*[local-name()='record']" , '(?!(://arxiv.org/abs/|:eu-repo/semantics/altIdentifier/arxiv/))([a-zA-Z].*)');
|
||||
|
||||
$varIdWos = identifierExtract('["//dc:identifier[starts-with(., \"WOS:\") or starts-with(., \"wos: WOS:\")]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/wos/\")]"]' , xpath:"./*[local-name()='record']" , '(info.*|WOS:.+|wos: WOS:.+)');
|
||||
|
||||
//oaf:identifier = set(xpath:"$varId//value[not[. = '10.1145/nnnnnnn.nnnnnnn']]", @identifierType = "doi";);
|
||||
oaf:identifier = set(xpath:"$varIdDoi//value[not(. = '10.1145/nnnnnnn.nnnnnnn')]", @identifierType = "doi";);
|
||||
oaf:identifier = set(xpath:"$varIdHdl//value", @identifierType = "handle";);
|
||||
oaf:identifier = set(xpath:"$varIdIsbn", @identifierType = "isbn";);
|
||||
|
||||
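// note: codepoints-to-string(38) is the ampersand '&'; for URL-shaped values the bibcode is taken between 'bibcode=' and the next '&', or up to the end of the string when no '&' follows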
oaf:identifier = set(xpath:"($varIdBibc//value[not(starts-with(., 'http'))]/replace(., 'BIBCODE:\s*', ''), $varIdBibc//value[starts-with(., 'http') and contains(substring-after(., 'bibcode='), codepoints-to-string(38))]/substring-before(substring-after(., 'bibcode='), codepoints-to-string(38)), $varIdBibc//value[starts-with(., 'http') and not(contains(substring-after(., 'bibcode='), codepoints-to-string(38)))]/substring-after(., 'bibcode='))", @identifierType = "bibcode";);
|
||||
|
||||
oaf:identifier = set(xpath:"$varIdPtnt//value/normalize-space(substring-after(., 'Patent N°:'))", @identifierType = "patentNumber";);
|
||||
|
||||
oaf:identifier = set(xpath:"$varPmId//value", @identifierType = "pmid";);
|
||||
oaf:identifier = set(xpath:"$varIdPmc//value", @identifierType = "pmcid";);
|
||||
//oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "hal";);
|
||||
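// e.g. 'oai:HAL:hal-01234567v2' (an illustrative identifier) becomes 'hal-01234567': the version suffix is stripped after cutting the 'oai:HAL:' prefix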
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))", @identifierType = "hal";);
|
||||
oaf:identifier = set(xpath:"distinct-values(($varIdArxv//value/normalize-space(replace(., '(https?://arxiv.org/abs/|https?://arxiv.org/pdf/|info:eu-repo/semantics/altIdentifier/arxiv/|info:eu-repo/semantics/altIdentifier/url/|info:eu-repo/semantics/altIdentifier/urn/|arXiv:|\.pdf)', '', 'i'))))", @identifierType = "arxiv";);
|
||||
oaf:identifier = set(xpath:"$varIdWos//value/normalize-space(replace(., '(info:eu-repo/semantics/altIdentifier/wos/|WOS:|wos:)', ''))", @identifierType = "wos";);
|
||||
|
||||
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and contains(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))]/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "landingPage";);
|
||||
oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and not(ends-with(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', '')))])", @identifierType = "url";);
|
||||
|
||||
oaf:identifier = set(xpath:"//dri:recordIdentifier", @identifierType = "oai-original";);
|
||||
|
||||
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
|
||||
|
||||
// journal data
|
||||
// avoiding regular expressions while a) correcting ISSNs that lack the '-' or carry other characters in its place and b) ignoring anything that follows the ISSN (e.g. print/online/...)
|
||||
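// e.g. 'ISSN:12345678' and 'ISSN:1234 5678' (illustrative values) both become '1234-5678'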
$varISSN = xpath:"//dc:source[starts-with(., 'ISSN:') and string-length(.) > 12]/concat(substring(normalize-space(substring-after(., 'ISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'ISSN:')), 1, 4))))";
|
||||
//$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) > 13]/normalize-space(substring-after(., 'ISSN:'))";
|
||||
$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) > 13]/concat(substring(normalize-space(substring-after(., 'EISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'EISSN:')), 1, 4))))";
|
||||
oaf:journal = set(xpath:"//oaf:datasourceprefix[$varISSN or $varEISSN]/''", @issn = xpath:"$varISSN";, @eissn = xpath:"$varEISSN";);
|
||||
|
||||
end
|
|
@ -0,0 +1,140 @@
|
|||
// from PROD 2021-07-06, tf script of DOAJ with more than 6 million records
|
||||
declare_script "dc_cleaning_OpenAIREplus_compliant_doaj";
|
||||
declare_ns oaf = "http://namespace.openaire.eu/oaf";
|
||||
declare_ns dri = "http://www.driver-repository.eu/namespace/dri";
|
||||
declare_ns dr = "http://www.driver-repository.eu/namespace/dr";
|
||||
declare_ns dc = "http://purl.org/dc/elements/1.1/";
|
||||
declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance";
|
||||
$var0 = "''";
|
||||
$varFP7 = "'corda_______::'";
|
||||
$varH2020 = "'corda__h2020::'";
|
||||
$varDummy = "''";
|
||||
// $varUnknownRepoId = "'openaire____::55045bd2a65019fd8e6741a755395c8c'";
|
||||
//
|
||||
$varUnknownRepoId = "'openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18'";
|
||||
$varUnknownRepoName = "'Unknown Repository'";
|
||||
static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]);
|
||||
static $varRepoid = xpath:"//dri:repositoryId";
|
||||
static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]);
|
||||
dri:objIdentifier = xpath:"//dri:objIdentifier";
|
||||
dri:repositoryId = $varRepoid;
|
||||
dri:recordIdentifier = xpath:"//dri:recordIdentifier";
|
||||
|
||||
if xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0][contains(., 'CDATA')][starts-with(normalize-space(.), '(')][starts-with(normalize-space(.), '.')]" dc:creator = skipRecord(); else $varDummy = "''";
|
||||
//apply xpath:"//dc:creator" if xpath:"string-length(normalize-space(.)) &amp;gt; 0 and not(contains(., 'CDATA')) and not(starts-with(normalize-space(.), '.')) and not(starts-with(normalize-space(.), '('))" dc:creator = Convert(xpath:".", Person); else $varDummy = "''";
|
||||
if xpath:"count(//dc:creator) = 0" dc:creator = skipRecord(); else $varDummy = "''";
|
||||
//apply xpath:"//dc:creator" if xpath:"string-length(.) &gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
$varOrcidName = xpath:"//dc:creator[string-length(normalize-space(.)) > 0]";
|
||||
$varOrcidOrcid = xpath:"//dc:creator[string-length(normalize-space(.)) > 0]/@id/replace(., 'https?://orcid.org/', '')";
|
||||
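// aligns each creator in $varOrcidName with the ORCID at the same position in $varOrcidOrcid via subsequence(..., position(), 1); this assumes the two sequences line up one-to-one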
dc:creator = set(xpath:"$varOrcidName", @nameIdentifier = xpath:"subsequence($varOrcidOrcid,position(),1)";, @nameIdentifierScheme=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','ORCID')";, @schemeUri=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','http://orcid.org/')";);
|
||||
|
||||
if xpath:"count(//dc:title[string-length(.) &gt; 0]) = 0" dc:title = skipRecord(); else $varDummy = "''";
|
||||
dc:title = xpath:"//dc:title/normalize-space(replace(., '^(&lt;title language=)(.)*(&gt;)', ''))";
|
||||
// apply xpath:"//dc:title" if xpath:"string-length(.) &gt; 0" dc:title = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
|
||||
apply xpath:"//dc:subject" if xpath:"string-length(.) &gt; 0 and not(@xsi:type = 'dcterms:LCSH')" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
dc:subject = set(xpath:"//dc:subject[@xsi:type = 'dcterms:LCSH']/concat('lcsh:', .)", @classid=xpath:"'lcsh'";, @classname=xpath:"'lcsh'";, @schemeid=xpath:"'dnet:subject_classification_typologies'";, @schemename=xpath:"'dnet:subject_classification_typologies'";);
|
||||
|
||||
apply xpath:"//dc:publisher" if xpath:"string-length(.) &gt; 0" dc:publisher = xpath:"normalize-space(replace(., '(&lt;br&gt;)', ''))"; else $varDummy = "''";
|
||||
apply xpath:"//dc:source" if xpath:"string-length(.) &gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''";
|
||||
dc:contributor = xpath:"//dc:contributor";
|
||||
dc:description = xpath:"//dc:description[not(starts-with(., 'URN: urn:nbn:') or starts-with(., 'URN: http'))]";
|
||||
dc:format = xpath:"//dc:format";
|
||||
$varHttpTest = "''";
|
||||
if xpath:"//dc:relation[starts-with(., 'http') or starts-with(., 'www.')]" $varHttpTest = "true"; else dc:identifier = skipRecord();
|
||||
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
|
||||
//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'www.')" dc:identifier = xpath:"concat('http://', normalize-space(.))"; else dr:CobjIdentifier = xpath:"normalize-space(.)";
|
||||
dr:CobjIdentifier = xpath:"distinct-values(//dc:identifier[not(starts-with(normalize-space(.), 'http'))][not(normalize-space(.) = ($varIdList))][not(starts-with(normalize-space(.), 'urn:nbn:') or starts-with(normalize-space(.), 'URN:NBN:'))][not(. = ($varISSN[1], $varISSN[2]))][normalize-space(.) != ''])";
|
||||
dc:identifier = xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .), $varIdLdpg//value, $varIdDoi//value)[1]";
|
||||
dc:relation = xpath:"//dc:relation[starts-with(., 'https://doaj.org/toc/')]";
|
||||
|
||||
dr:dateOfCollection = xpath:"//dri:dateOfCollection";
|
||||
static dr:dateOfTransformation = xpath:"current-dateTime()";
|
||||
// dc:type = xpath:"//dc:type";
|
||||
dc:language = Convert(xpath:"//dc:language", Languages);
|
||||
//if xpath:"//dc:rights[text()='info:eu-repo/semantics/openAccess']" dc:publisher = xpath:"//dc:publisher"; else dc:publisher = skipRecord();
|
||||
dc:date = xpath:"//dc:date";
|
||||
oaf:dateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()");
|
||||
apply xpath:"//dc:date" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/date')" oaf:embargoenddate = RegExpr(xpath:"normalize-space(.)", $var0, "s/^(.*info:eu-repo\/date\/embargoEnd\/)//gmi"); else $var0 = "''";
|
||||
//apply xpath:"//dc:relation" if xpath:"string-length(substring-after(normalize-space(.), 'info:eu-repo/grantAgreement/EC/FP7/')) = 6" oaf:projectid = RegExpr(xpath:"normalize-space(.)", $var1, "s/^(.*info:eu-repo\/grantAgreement\/EC\/FP7\/)//gmi"); else dc:relation = xpath:"normalize-space(.)";
|
||||
//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)";
|
||||
//
|
||||
oaf:collectedDatasourceid = $varDatasourceid;
|
||||
//
|
||||
// apply xpath:"//dc:type" if xpath:"." dr:CobjCategory = Convert(xpath:"normalize-space(.)", TextTypologies); else dc:type = xpath:".";
|
||||
//dr:CobjCategory = "0001";
|
||||
$varCobjCategory = Convert(xpath:"//dc:type", TextTypologies);
|
||||
$varSuperType = Convert(xpath:"normalize-space($varCobjCategory)", SuperTypes);
|
||||
dr:CobjCategory = set($varCobjCategory, @type = $varSuperType;);
|
||||
dc:type = xpath:"//dc:type";
|
||||
//
|
||||
// review status
|
||||
|
||||
$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]/'0001', //*[string(node-name(.)) = 'dc:relation' and matches(., '^info:eu-repo/semantics/altIdentifier/doi/10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$', 'i')]/'0001')";
|
||||
|
||||
$varRefereedProse = xpath:"(//*[string(node-name(.)) = 'dc:description' and matches(lower-case(.), '.*this\s*preprint\s*has\s*been\s*reviewed\s*and\s*recommended\s*by\s*peer\s*community') and contains(., '10.24072/')]/'0001', //dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001')";
|
||||
$varRefereedReltn = xpath:"(//dc:relation, //dc:identifier)[contains(., '://www.dovepress.com/') and matches(lower-case(.), '.*-peer-reviewed-(fulltext-)?article-.*')]/'0001'";
|
||||
$varRefereedTitle = xpath:"//dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001'";
|
||||
$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or matches(lower-case(.), '(this|a)\s*(article|preprint)\s*(has\s*been\s*)?(peer[\-\s]*)?reviewed\s*and\s*recommended\s*by\s*peer[\-\s]*community')]/'0001')";
|
||||
$varRefereed = xpath:"($varRefereedIdntf, $varRefereedProse, $varRefereedReltn, $varRefereedTitle, $varRefereedDesct)";
|
||||
//if xpath:"$varRefereed" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
|
||||
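// a record is marked peer-reviewed ('0001') as soon as any of the heuristics above contributed a '0001'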
if xpath:"count(index-of($varRefereed, '0001')) >0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''";
|
||||
if xpath:"count(index-of($varRefereed, '0002')) >0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''";
|
||||
//
|
||||
apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:".";
|
||||
if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics')]" $var0 = "''"; else oaf:accessrights = "OPEN";
|
||||
//if xpath:"count(//dc:rights) = 0" oaf:accessrights = "OPEN"; else $var0 = "''";
|
||||
// oaf:accessrights = Convert(xpath:"normalize-space(//dc:rights)", AccessRights);
|
||||
oaf:license = xpath:"(//dc:rights, //dc:relation)[starts-with(normalize-space(.), 'http') and (contains(., '/licenses/') or contains(., '/licence/') or contains(., '/licencias/') or contains(., '/licencia/') or contains(., '://creativecommons.org/') or contains(., '://rightsstatements.org/')) or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')][not(contains(normalize-space(.), ' '))]/normalize-space(.)";
|
||||
//
|
||||
static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
|
||||
static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;);
|
||||
//
|
||||
//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)');
|
||||
$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]", "//dc:relation[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)');
|
||||
$varIdHdl = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://hdl.handle.net/\")][not(contains(., \"123456789\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(://hdl.handle.net/))(\d.*)');
|
||||
$varIdUrn = identifierExtract('["//dc:relation[starts-with(., \"urn:nbn:\") or starts-with(., \"URN:NBN:\") or (starts-with(., \"http\") and (contains(., \"://nbn-resolving.org/urn:nbn:\") or contains(., \"://nbn-resolving.de/urn/resolver.pl?urn:nbn:\") or contains(., \"://nbn-resolving.de/urn:nbn:\") or contains(., \"://resolver.obvsg.at/urn:nbn:\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve?urn=urn:nbn:\")))]", "//dc:description[contains(., \"URN: urn:nbn:de:0114-\") or contains(., \"URN: http://nbn-resolving.de/urn:nbn:de:0114-\") or (contains(., \"URN:NBN:no-\") and //dc:identifier = \"1893-1774\")]"]' , xpath:"./*[local-name()='record']" , '((urn:nbn:|URN:NBN:).*)');
|
||||
$varIdArk = identifierExtract('["//dc:relation[starts-with(normalize-space(.), \"http\") and contains(., \"/ark:\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
|
||||
$varIdPmid = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
|
||||
$varIdPmc = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\") or contains(., \"//europepmc.org/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(http.*)');
|
||||
$varIdHal = identifierExtract('["//dc:relation[starts-with(., \"hal-\") or starts-with(., \"halshs-\") or starts-with(., \"halsde-\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '(hal(shs|sde)?-.*)');
|
||||
$varIdArxv = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))]"]' , xpath:"./*[local-name()='record']" , '(\d.*)');
|
||||
$varIdLdpg = identifierExtract('["//dc:identifier[starts-with(., \"https://doaj.org/article/\")]"]', xpath:"./*[local-name()='record']" , '(http.*)');
|
||||
$varIdUrl = identifierExtract('["//dc:relation[starts-with(., \"http\")][not(contains(., \"://doaj.org\"))][not(contains(., \"doi.org/\"))][not(contains(., \"hdl.handle.net/\"))][not(contains(., \"://nbn-resolving.de/\") or contains(., \"://nbn-resolving.org/\") or contains(., \"://resolver.obvsg.at/\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve\"))][not(contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))][not(contains(., \"://localhost/\") or contains(., \"://localhost:\"))]", "//dc:relation[starts-with(., \"www\")]"]', xpath:"./*[local-name()='record']" , '((http|www).*)');
|
||||
|
||||
$varIdList = xpath:"(($varIdDoi//value, $varIdHdl//value, $varIdUrn//value, $varIdArk//value, $varIdPmid//value, $varIdPmc//value, $varIdLdpg//value, $varIdUrl//value))";
|
||||
|
||||
// dropping/cleaning wrong DOIs, such as:
|
||||
// pairs of DOIs that differ only in a trailing '.' (mostly, but not exclusively, prefixed with 10.5216)
|
||||
// noise stemming from odd or malformed DOI statement formats
|
||||
// DOIs with 2 prefixes
|
||||
// DOI statements that contain first the bare DOI prefix and then the full DOI including the resolver prefix
|
||||
//oaf:identifier = set(xpath:"$varId//value", @identifierType = "doi";);
|
||||
//oaf:identifier = set(xpath:"$varIdDoi//value", @identifierType = "doi";);
|
||||
|
||||
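// e.g. '10.5216/abc.' (an illustrative DOI) is dropped when '10.5216/abc' is also present, and a double statement like '10.1234/https://doi.org/10.1234/xyz' is reduced to '10.1234/xyz'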
oaf:identifier = set(xpath:"distinct-values(($varIdDoi//value[not(ends-with(., '.') and exists(index-of($varIdDoi//value, substring(., 1, string-length(.)-1))))][not(. = '10.4313/article-4')][not(lower-case(.) = ('10.30659/ijibe.2.1.171-181', '10.30659/ijibe.2.1.171', '10.26843/rencima.v8i4.149', '10.26843/rencima.v11i1.215', '10.18273/revfue.v14n2-2016002revista', '10.17061/phrp3112015', '10.21789/24222704', '10.22432/pjsr.2017.14.', '10.22432/pjsr.2017.18.02', '10.22432/pjsr.2017.18.'))][not(starts-with(., '10.1530/VAB-'))][not(starts-with(lower-case(.), '10.1155/s168761720'))][not(starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/'))][not(starts-with(., '10.7454/jvi.v') and string-length(.) = 16)][not(starts-with(., '10.15094/0000') and string-length(.) = 16)][not(matches(., '^10\.\d*/DOI:$'))][not(starts-with(., concat(substring-before(., '/'), '/', substring-before(., '/'), '/')))][not(matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/')))][not(starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.(eor|20050521)$'))][not(substring-before(., '/') = ('10.19183', '10.18066') and matches(., '^(10\.19183/how\.\d*\.\d*|10\.18066/revunivap\.v\d*i\d*)$'))]/lower-case(.), $varIdDoi//value[matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/'))]/substring-after(., 'doi.org/'), $varIdDoi//value[starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.eor$')]/substring(., 1, 28), $varIdDoi//value[starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/')]/substring-after(., '/')))", @identifierType = "doi";);
|
||||
|
||||
oaf:identifier = set(xpath:"distinct-values($varIdHdl//value/normalize-space(replace(., '\?locatt=view:master', '')))", @identifierType = "handle";);
|
||||
oaf:identifier = set(xpath:"$varIdUrn//value", @identifierType = "urn";);
|
||||
oaf:identifier = set(xpath:"distinct-values($varIdArk//value/replace(substring-after(., '/ark:'), '^/', ''))", @identifierType = "ark";);
|
||||
oaf:identifier = set(xpath:"distinct-values($varIdPmid//value/replace(., 'https?://www.ncbi.nlm.nih.gov/pmc/articles/pmid/(\d+)(/.*)?', '$1'))", @identifierType = "pmid";);
|
||||
oaf:identifier = set(xpath:"distinct-values($varIdPmc//value/replace(., 'https?://(www.ncbi.nlm.nih.gov/pmc|europepmc.org)/articles/(PMC\d*)([/\?].*)?', '$2'))", @identifierType = "pmcid";);
|
||||
oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '/document', ''))", @identifierType = "hal";);
|
||||
oaf:identifier = set(xpath:"$varIdArxv//value", @identifierType = "arxiv";);
|
||||
oaf:identifier = set(xpath:"$varIdLdpg//value", @identifierType = "landingPage";);
|
||||
oaf:identifier = set(xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .))", @identifierType = "url";);
|
||||
|
||||
oaf:datasourceprefix = xpath:"//oaf:datasourceprefix";
|
||||
|
||||
//$varJournalName = xpath:"substring-before(//dc:source, ',')";
|
||||
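// parses DOAJ-style dc:source strings such as 'Some Journal, Vol 5, Iss 2, Pp 11-29' (an illustrative value) into title, volume, issue and page fields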
$varJournalTitle = xpath:"(//dc:source[contains(., ', Vol ')]/substring-before(., ', Vol '), //dc:source[contains(., ', Iss ')]/substring-before(., ', Iss '))[1]";
|
||||
$varVol = xpath:"//dc:source[contains(., ', Vol ')][matches(., ', Vol \d+')]/replace(substring-after(., ', Vol '), '^(\d+).*$', '$1')";
|
||||
$varIss = xpath:"//dc:source[contains(., ', Iss ')][matches(., ', Iss \d+')]/replace(substring-after(., ', Iss '), '^(\d+).*$', '$1')";
|
||||
$varSp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/substring-before(substring-after(., ', Pp '), '-')";
|
||||
$varEp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/replace(substring-after(substring-after(., ', Pp '), '-'), '^(\d+).*$', '$1')";
|
||||
$varISSN = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";
|
||||
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][1]"; , @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][2]";);
|
||||
//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9]";);
|
||||
oaf:journal = set($varJournalTitle, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";, @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][2]";, @vol = xpath:"$varVol";, @iss = xpath:"$varIss";, @sp = xpath:"$varSp";, @ep = xpath:"$varEp";);
|
||||
|
||||
end
|
|
@ -0,0 +1,492 @@
|
|||
<!-- from PROD 2021-06-14 -->
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
|
||||
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
|
||||
extension-element-prefixes="transformExt TransformationFunction"
|
||||
exclude-result-prefixes="transformExt TransformationFunction" >
|
||||
<xsl:output indent="yes" omit-xml-declaration="yes"/>
|
||||
|
||||
<!--
|
||||
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
|
||||
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
|
||||
-->
|
||||
|
||||
<xsl:param name="varOfficialName" />
|
||||
<xsl:param name="varDsType" />
|
||||
<xsl:param name="varDataSourceId" />
|
||||
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
|
||||
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
|
||||
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
|
||||
<xsl:param name="varFP7" select="'corda_______::'"/>
|
||||
<xsl:param name="varH2020" select="'corda__h2020::'"/>
|
||||
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
|
||||
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
|
||||
|
||||
<xsl:param name="index" select="0"/>
|
||||
<xsl:param name="transDate" select="current-dateTime()"/>
|
||||
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
|
||||
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
|
||||
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
|
||||
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
|
||||
|
||||
<xsl:template name="terminate">
|
||||
<xsl:message terminate="yes">
|
||||
record is not compliant, transformation is interrupted.
|
||||
</xsl:message>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="/">
|
||||
<record>
|
||||
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
||||
<metadata>
|
||||
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
|
||||
<xsl:call-template name="terminate"/>
|
||||
</xsl:if>
|
||||
<!-- in Journal.fi the xml:lang of translated titles is not on the trans-title element itself but on the surrounding trans-title-group element (which contains just one trans-title element) -->
|
||||
<!--
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:title'"/>
|
||||
</xsl:call-template>
|
||||
-->
|
||||
<xsl:call-template name="title">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
|
||||
</xsl:call-template>
|
||||
|
||||
<xsl:call-template name="authors">
|
||||
<!--
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
|
||||
-->
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
|
||||
</xsl:call-template>
|
||||
<!-- <xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
|
||||
</xsl:call-template>
|
||||
-->
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:description'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:subject'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:subject'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:source'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:element name="dc:language">
|
||||
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
|
||||
</xsl:element>
|
||||
<xsl:element name="dc:identifier">
|
||||
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
|
||||
</xsl:element>
|
||||
<xsl:element name="oaf:dateAccepted">
|
||||
<!--
|
||||
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />
|
||||
|
||||
<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />
|
||||
|
||||
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
|
||||
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
|
||||
concat(./*[local-name()='year'], '-',
|
||||
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
|
||||
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
|
||||
-->
|
||||
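<!-- zero-pads one-digit months and days: for month '3', substring('031', 1 idiv 2 + 1, 2) = '03'; for month '11', substring('0111', 2 idiv 2 + 1, 2) = '11' (worked example) -->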
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
|
||||
concat(./*[local-name()='year'], '-',
|
||||
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
|
||||
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 +1, 2))" />
|
||||
|
||||
</xsl:element>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
|
||||
<xsl:choose>
|
||||
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2] and ./*[local-name()='day' and string-length(normalize-space(.)) = 2]">
|
||||
<dc:date>
|
||||
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
|
||||
</dc:date>
|
||||
</xsl:when>
|
||||
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2]">
|
||||
<dc:date>
|
||||
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
|
||||
</dc:date>
|
||||
</xsl:when>
|
||||
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4]">
|
||||
<dc:date>
|
||||
<xsl:value-of select="./*[local-name()='year']"/>
|
||||
</dc:date>
|
||||
</xsl:when>
|
||||
</xsl:choose>
|
||||
</xsl:for-each>
|
||||
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:rights'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
|
||||
<xsl:with-param name="targetElement" select="'oaf:license'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:relation'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="identifiers">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
|
||||
</xsl:call-template>
|
||||
|
||||
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
|
||||
<oaf:identifier>
|
||||
<xsl:attribute name="identifierType">
|
||||
<xsl:text>landingPage</xsl:text>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="."/>
|
||||
</oaf:identifier>
|
||||
</xsl:for-each>
|
||||
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
|
||||
<oaf:fulltext>
|
||||
<xsl:value-of select="."/>
|
||||
</oaf:fulltext>
|
||||
</xsl:for-each>
|
||||
|
||||
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]">
|
||||
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
|
||||
<oaf:projectid>
|
||||
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
|
||||
</oaf:projectid>
|
||||
</xsl:if>
|
||||
</xsl:for-each>
|
||||
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
|
||||
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
|
||||
<oaf:projectid>
|
||||
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
|
||||
</oaf:projectid>
|
||||
</xsl:if>
|
||||
</xsl:for-each>
|
||||
|
||||
<!-- -->
|
||||
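<!-- derives access rights from the access-right custom-meta, license URLs and free_to_read elements: a free_to_read whose start date is not in the future and whose end date is not in the past maps to 'open', one with a future start or past end date maps to 'embargo' (a literal reading of the date predicates below) -->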
<xsl:variable name='varRights' select="distinct-values((for $i in (
|
||||
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
|
||||
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
|
||||
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
|
||||
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
|
||||
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
|
||||
return TransformationFunction:convertString($tf, normalize-space($i), 'AccessRights')))" />
|
||||
|
||||
<!--
|
||||
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
|
||||
-->
|
||||
|
||||
<oaf:accessrights>
|
||||
<xsl:choose>
|
||||
<xsl:when test="$varRights[. = 'EMBARGO']">
|
||||
<xsl:value-of select="'EMBARGO'"/>
|
||||
</xsl:when>
|
||||
<xsl:when test="$varRights[. != 'UNKNOWN']">
|
||||
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="$varRights[1]"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</oaf:accessrights>
|
||||
|
||||
<!--
|
||||
<oaf:accessrights>
|
||||
<xsl:value-of select="$varRights[1]"/>
|
||||
</oaf:accessrights>
|
||||
|
||||
<xsl:element name="oaf:accessrights">
|
||||
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
|
||||
</xsl:element>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<xsl:element name="dr:CobjCategory">
|
||||
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
|
||||
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
|
||||
<xsl:attribute name="type" select="$varSuperType"/>
|
||||
<xsl:value-of select="$varCobjCategory" />
|
||||
</xsl:element>
|
||||
|
||||
<xsl:variable name='varCobjCatLst' select="for $i in (
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
|
||||
//*[local-name() = 'article']/@article-type)
|
||||
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
|
||||
-->
|
||||
|
||||
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
|
||||
//*[local-name() = 'article']/@article-type))"/>
|
||||
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
|
||||
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')))" />
|
||||
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
|
||||
return concat($i, '###', TransformationFunction:convertString($tf, normalize-space($i), 'SuperTypes'))" />
|
||||
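<!-- cascade over the type###supertype pairs: first a pair whose supertype is not 'other' and whose code is none of '0038', '0039', '0040'; then any pair with a non-'other' supertype; then any code other than '0020' and '0000'; then any code other than '0000'; otherwise the default '0000' with type 'other' -->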
<dr:CobjCategory>
|
||||
<xsl:choose>
|
||||
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))]) > 0">
|
||||
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))][1]" />
|
||||
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
|
||||
<xsl:value-of select="substring-before($varCobjSup, '###')" />
|
||||
</xsl:when>
|
||||
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other')]) > 0">
|
||||
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other')][1]" />
|
||||
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
|
||||
<xsl:value-of select="substring-before($varCobjSup, '###')" />
|
||||
</xsl:when>
|
||||
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))]) > 0">
|
||||
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))][1]" />
|
||||
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
|
||||
<xsl:value-of select="substring-before($varCobjSup, '###')" />
|
||||
</xsl:when>
|
||||
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0000'))]) > 0">
|
||||
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0000'))][1]" />
|
||||
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
|
||||
<xsl:value-of select="substring-before($varCobjSup, '###')" />
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:attribute name="type" select="'other'"/>
|
||||
<xsl:value-of select="'0000'" />
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</dr:CobjCategory>
|
||||
|
||||
<!--
|
||||
<xsl:for-each select="$varCobjSupLst">
|
||||
<dc:type>
|
||||
<xsl:value-of select="."/>
|
||||
</dc:type>
|
||||
</xsl:for-each>
|
||||
-->
|
||||
|
||||
<xsl:for-each select="$varTypLst">
|
||||
<dc:type>
|
||||
<xsl:value-of select="."/>
|
||||
</dc:type>
|
||||
</xsl:for-each>
|
||||
|
||||
<!--
|
||||
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
|
||||
<dc:type>
|
||||
<xsl:value-of select="."/>
|
||||
</dc:type>
|
||||
</xsl:for-each>
|
||||
-->
|
||||
|
||||
<oaf:language>
|
||||
<xsl:value-of select="TransformationFunction:convertString($tf, //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'Languages')" />
|
||||
</oaf:language>
|
||||
|
||||
<!-- review status -->
|
||||
<!-- ToDo:
|
||||
review status
|
||||
~ ask Journal.fi to put it elsewhere
|
||||
~ evaluate article-version (no example found yet)
|
||||
subject/kwd:
|
||||
~ handle thesauri (no example found yet)
|
||||
relations:
|
||||
~ handle fn (no example found yet)
|
||||
-->
|
||||
<!--
|
||||
<xsl:variable name="varRefereedConvt" select="for $i in (
|
||||
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
|
||||
//*[local-name() = 'article']/@article-type)
|
||||
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
|
||||
-->
|
||||
|
||||
<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
|
||||
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
|
||||
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
|
||||
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
|
||||
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
|
||||
<!--
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="$varRefereedDescp"/>
|
||||
</oaf:refereed>
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="$varRefereed"/>
|
||||
</oaf:refereed>
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
|
||||
</oaf:refereed>
|
||||
-->
|
||||
<xsl:choose>
|
||||
<xsl:when test="count($varRefereed[. = '0001']) > 0">
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="'0001'"/>
|
||||
</oaf:refereed>
|
||||
</xsl:when>
|
||||
<xsl:when test="count($varRefereed[. = '0002']) > 0">
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="'0002'"/>
|
||||
</oaf:refereed>
|
||||
</xsl:when>
|
||||
</xsl:choose>
|
||||
|
||||
<xsl:call-template name="journal">
|
||||
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
|
||||
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
|
||||
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
|
||||
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
|
||||
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
|
||||
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
|
||||
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
|
||||
</xsl:call-template>
|
||||
<oaf:hostedBy>
|
||||
<xsl:attribute name="name">
|
||||
<xsl:value-of select="$varOfficialName"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="$varDataSourceId"/>
|
||||
</xsl:attribute>
|
||||
</oaf:hostedBy>
|
||||
<oaf:collectedFrom>
|
||||
<xsl:attribute name="name">
|
||||
<xsl:value-of select="$varOfficialName"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="$varDataSourceId"/>
|
||||
</xsl:attribute>
|
||||
</oaf:collectedFrom>
|
||||
</metadata>
|
||||
<xsl:copy-of select="//*[local-name() = 'about']" />
|
||||
</record>
|
||||
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="allElements">
|
||||
<xsl:param name="sourceElement"/>
|
||||
<xsl:param name="targetElement"/>
|
||||
<xsl:for-each select="$sourceElement">
|
||||
<xsl:element name="{$targetElement}">
|
||||
<xsl:if test="(.[@xml:lang] or ..[@xml:lang]) and $targetElement = ('dc:title', 'dc:description', 'dc:subject')">
|
||||
<xsl:attribute name="xml:lang">
|
||||
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
|
||||
</xsl:attribute>
|
||||
</xsl:if>
|
||||
<xsl:value-of select="normalize-space(.)"/>
|
||||
</xsl:element>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="title">
|
||||
<xsl:param name="sourceElement"/>
|
||||
<xsl:for-each select="$sourceElement">
|
||||
<xsl:element name="dc:title">
|
||||
<xsl:if test=".[@xml:lang] or ..[@xml:lang]">
|
||||
<xsl:attribute name="xml:lang">
|
||||
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
|
||||
</xsl:attribute>
|
||||
</xsl:if>
|
||||
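<!-- joins the title with any following subtitles, e.g. an article-title 'Foo' with subtitle 'Bar' (illustrative values) yields dc:title 'Foo: Bar' -->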
<xsl:value-of select="string-join((., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')])/normalize-space(.), ': ')"/>
|
||||
</xsl:element>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="journal">
|
||||
<xsl:param name="journalTitle"/>
|
||||
<xsl:param name="issn"/>
|
||||
<xsl:param name="eissn"/>
|
||||
<xsl:param name="vol"/>
|
||||
<xsl:param name="issue"/>
|
||||
<xsl:param name="sp"/>
|
||||
<xsl:param name="ep"/>
|
||||
<xsl:element name="oaf:journal">
|
||||
<xsl:attribute name="issn">
|
||||
<xsl:value-of select="normalize-space($issn)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="eissn">
|
||||
<xsl:value-of select="normalize-space($eissn)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="vol">
|
||||
<xsl:value-of select="normalize-space($vol)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="iss">
|
||||
<xsl:value-of select="normalize-space($issue)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="sp">
|
||||
<xsl:value-of select="normalize-space($sp)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="ep">
|
||||
<xsl:value-of select="normalize-space($ep)"/>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="normalize-space($journalTitle)"/>
|
||||
</xsl:element>
|
||||
</xsl:template>
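<!-- emits e.g. <oaf:journal issn="1234-5678" eissn="" vol="12" iss="3" sp="1" ep="10">Some Journal</oaf:journal> for illustrative parameter values; missing parameters yield empty attributes -->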
|
||||
|
||||
|
||||
<xsl:template name="identifiers">
|
||||
<xsl:param name="sourceElement"/>
|
||||
<xsl:if test="string-length($sourceElement[@pub-id-type='doi']) gt 0">
|
||||
<xsl:element name="oaf:identifier">
|
||||
<xsl:attribute name="identifierType">
|
||||
<xsl:text>doi</xsl:text>
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
|
||||
</xsl:element>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
|
||||
<xsl:template name="authors">
|
||||
<xsl:param name="sourceElement"/>
|
||||
<xsl:for-each select="$sourceElement">
|
||||
<xsl:element name="dc:creator">
|
||||
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
|
||||
<xsl:attribute name="nameIdentifierScheme">
|
||||
<xsl:text>ORCID</xsl:text>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="schemeURI">
|
||||
<xsl:text>http://orcid.org/</xsl:text>
|
||||
</xsl:attribute>
|
||||
<xsl:attribute name="nameIdentifier">
|
||||
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
|
||||
</xsl:attribute>
|
||||
</xsl:if>
|
||||
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
|
||||
</xsl:element>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
|
||||
|
||||
<xsl:template match="//*[local-name() = 'header']">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
<xsl:element name="dr:dateOfTransformation">
|
||||
<xsl:value-of select="$transDate"/>
|
||||
</xsl:element>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
|
||||
<xsl:template match="node()|@*">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
|
@ -0,0 +1,437 @@
|
|||
<!-- from production 2021-06-14 -->
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.1"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:transformExt="http://namespace.openaire.eu/java/org.apache.commons.codec.digest.DigestUtils"
|
||||
xmlns:TransformationFunction="eu.dnetlib.data.collective.transformation.core.xsl.ext.TransformationFunctionProxy"
|
||||
extension-element-prefixes="transformExt TransformationFunction"
|
||||
exclude-result-prefixes="transformExt TransformationFunction" >
|
||||
<xsl:output indent="yes" omit-xml-declaration="yes"/>
|
||||
|
||||
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
|
||||
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
|
||||
|
||||
<xsl:param name="varOfficialName" />
|
||||
<xsl:param name="varDsType" />
|
||||
<xsl:param name="varDataSourceId" />
|
||||
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
|
||||
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
|
||||
<xsl:param name="varFP7" select="'corda_______::'"/>
|
||||
<xsl:param name="varH2020" select="'corda__h2020::'"/>
|
||||
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
|
||||
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
|
||||
|
||||
<xsl:param name="index" select="0"/>
|
||||
<xsl:param name="transDate" select="current-dateTime()"/>
|
||||
<xsl:variable name="tf" select="TransformationFunction:getInstance()"/>
|
||||
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
|
||||
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
|
||||
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
|
||||
|
||||
<xsl:template name="terminate">
|
||||
<xsl:message terminate="yes">
|
||||
record is not compliant, transformation is interrupted.
|
||||
</xsl:message>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="/">
|
||||
<record>
|
||||
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
||||
<metadata>
|
||||
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
|
||||
<xsl:call-template name="terminate"/>
|
||||
</xsl:if>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0]"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:title'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="authors">
|
||||
<!--
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'contrib'][@contrib-type='author']"/>
|
||||
-->
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))][./*[local-name()='name'] or ./*[local-name()='name-alternatives']/*[local-name()='name']][string-length(.//*[local-name()='surname']) + string-length(.//*[local-name()='given-names']) > 0]"/>
|
||||
</xsl:call-template>
|
||||
<!-- <xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
|
||||
</xsl:call-template>
|
||||
-->
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()='abstract']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:description'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:subject'"/>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="allElements">
|
||||
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group' and not(lower-case(@kwd-group-type)=('mesh', 'ocis'))]//*[local-name()='kwd']"/>
|
||||
<xsl:with-param name="targetElement" select="'dc:subject'"/>
|
||||
</xsl:call-template>
|
||||
|
||||
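<!-- strips an optional 'mesh ' prefix before adding the 'mesh:' scheme prefix, e.g. kwd 'mesh Neoplasms' and 'Neoplasms' (illustrative terms) both become 'mesh:Neoplasms' -->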
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='mesh' and ./*[local-name()='kwd']]">
|
||||
<xsl:for-each select="./*[local-name()='kwd']">
|
||||
<dc:subject>
|
||||
<xsl:attribute name="subjectScheme" select="'mesh'"/>
|
||||
<xsl:attribute name="schemeURI" select="'http://www.nlm.nih.gov/mesh/'"/>
|
||||
<xsl:attribute name="valueURI" select="''"/>
|
||||
<xsl:value-of select="./concat('mesh:', replace(., 'mesh (.*)$', '$1'))"/>
|
||||
</dc:subject>
|
||||
</xsl:for-each>
|
||||
</xsl:for-each>
|
||||
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='ocis' and ./*[local-name()='kwd']]">
|
||||
<xsl:for-each select="./*[local-name()='kwd']">
|
||||
<dc:subject>
|
||||
<xsl:attribute name="subjectScheme" select="'ocis'"/>
|
||||
<xsl:attribute name="schemeURI" select="''"/>
|
||||
<xsl:attribute name="valueURI" select="''"/>
|
||||
<xsl:value-of select="./concat('ocis:', .)"/>
|
||||
</dc:subject>
|
||||
</xsl:for-each>
|
||||
</xsl:for-each>
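<!-- MeSH and OCIS keyword groups are handled separately so each dc:subject can carry
     subjectScheme/schemeURI attributes; all other kwd-groups go through the plain
     allElements/dc:subject path above. -->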

<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>

<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/(*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version'], *[local-name() = 'article-version'])/concat('article-version (', @article-version-type, ') ', .)"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>

<xsl:element name="dc:language">
<xsl:text>eng</xsl:text>
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
<xsl:element name="oaf:fulltext">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<xsl:choose>
<xsl:when test="//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub'] or //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']" >
<xsl:if test="string(number($month)) eq 'NaN'" >
<xsl:value-of select="concat($year, '-', '01', '-', '01')" />
</xsl:if>
<xsl:if test="string(number($month)) != 'NaN'" >
<xsl:value-of select="concat($year, '-', $month, '-', '01')" />
</xsl:if>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat(//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='ppub']/*[local-name()='year'], '-01-01')" />
</xsl:otherwise>
</xsl:choose>
</xsl:element>
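<!-- oaf:dateAccepted falls back stepwise: with an electronic pub-date the day is
     normalised to the first of the month (or to 01-01 when the month is not
     numeric); without one, the print (ppub) year is used with 01-01. -->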
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='copyright-statement'])"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='license'])"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>

<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>

<xsl:element name="oaf:accessrights">
<xsl:text>OPEN</xsl:text>
</xsl:element>

<xsl:element name="dr:CobjCategory">
<xsl:attribute name="type" select="'publication'"/>
<xsl:text>0001</xsl:text>
</xsl:element>

<dc:type>
<xsl:value-of select="//*[local-name() = 'article']/@article-type"/>
</dc:type>

<!-- custom-meta perhaps not used for types, then drop
<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varTypLst' select="//*[local-name() = 'article']/@article-type"/>
-->
<!-- perhaps ensure that file indeed exists, e.g. as pdf etc -->
<!--
// reduce load for the big PubMed records by exchanging variables with choose
<xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedFnote" select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
matches(lower-case(.), '.*refereed\s*anonymously.*') or
matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*')
]/'0001'"/>
<xsl:variable name="varRefereedReviw" select="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
matches(lower-case(.), '.*peer\s*review\s*file.*')]/'0001'"/>
<xsl:variable name="varRefereedReltn" select="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]/'0002'"/>
<xsl:variable name="varRefereedCtRol" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
[./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]/'0001'"/>
<xsl:variable name="varRefereedVersn" select="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]/'0002'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedFnote, $varRefereedReviw, $varRefereedReltn, $varRefereedCtRol, $varRefereedVersn)"/>
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
-->
<xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
<xsl:choose>
<xsl:when test="count($varRefereedConvt[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>

<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'article-id'][@pub-id-type='doi'][matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>

<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
matches(lower-case(.), '.*refereed\s*anonymously.*') or
matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*') or
matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\].*') or
matches(lower-case(.), '.*\[.*referees\s*:.*\].*') or
matches(lower-case(.), '^\s*plagiarism[\s\-\._]check.*') or
matches(lower-case(.), '^\s*peer[\s\-\._]*review.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
matches(lower-case(.), '.*peer\s*review\s*file.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
[./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereedConvt[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>
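<!-- The whens above are ordered so that every positive peer-review signal
     (vocabulary match, F1000-style DOI, abstract/footnote/supplement/reviewer
     hints -> '0001') wins before the weaker non-peer-reviewed signals
     (related-article links, preprint versions -> '0002'). -->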

<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varHostedByName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varHostedById"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>

<xsl:for-each select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = 'fn-group']/*[local-name() = 'fn'][matches(lower-case(.), 'country(/territory)? of origin:?\s*[A-Za-z\-]+')]">
<oaf:country>
<!--
<xsl:value-of select="TransformationFunction:convertString($tf, replace(lower-case(.), '^(.|\s)*country(/territory)? of origin:?\s+([A-Za-z\-,\(\)]+(\s+[A-Za-z\-,\(\)]+)*)(.|\s)*$', '$3'), 'Countries')"/>
-->
<xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), 'of origin'), 2)), 'Countries')"/>
</oaf:country>
</xsl:for-each>

</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>

</xsl:template>

<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>

<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)"/>
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)"/>
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)"/>
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)"/>
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)"/>
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)"/>
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>

<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmc</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='pmcid']"/>
</xsl:element>
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>pmid</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='pmid']"/>
</xsl:element>
</xsl:template>

<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
</xsl:attribute>
</xsl:if>
<!--
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
-->
<xsl:value-of select="concat(normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='surname']), ', ', normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>

<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
@@ -0,0 +1,493 @@
<!-- from PROD 2021-06-14 -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
exclude-result-prefixes="xsl vocabulary dateCleaner"
version="2.0">

<!--
<xsl:param name="varHostedById" select="'opendoar____::908'"/>
<xsl:param name="varHostedByName" select="'Europe PubMed Central'"/>
-->

<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'"/>
<xsl:param name="varFP7OtherDOI" select="'10.13039/100011102'"/>
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'"/>
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>

<xsl:param name="index" select="0"/>
<xsl:param name="transDate" select="current-dateTime()"/>
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />

<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>

<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate"/>
</xsl:if>
<!-- in journal.fi xml:lang of translated titles is not within the trans-title element but within the surrounding trans-title-group element (which just contains 1 trans-title element) -->
<!--
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()=('article-title', 'trans-title-group')][string-length(normalize-space(.))> 0]"/>
<xsl:with-param name="targetElement" select="'dc:title'"/>
</xsl:call-template>
-->
<xsl:call-template name="title">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name()='title-group']//*[local-name()=('article-title', 'trans-title', 'subtitle', 'trans-subtitle')]"/>
</xsl:call-template>

<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))]"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group'][@content-type='author']/*[local-name() = 'contrib']"/>
</xsl:call-template>
<!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()=('abstract', 'trans-abstract')]"/>
<xsl:with-param name="targetElement" select="'dc:description'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group']//*[local-name()='kwd']"/>
<xsl:with-param name="targetElement" select="'dc:subject'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']"/>
<xsl:with-param name="targetElement" select="'dc:publisher'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="targetElement" select="'dc:source'"/>
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:value-of select="//*[local-name()='metadata']//*[local-name()='article']/@xml:lang" />
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="//*[local-name()='article-meta']/*[local-name()='self-uri'][contains(./@xlink:href, '/view/')]/@xlink:href" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<!--
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])" />

<xsl:value-of select="TransformationFunction:Convert($tf, //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub'], 'DateISO8601', 'yyyy-MM-dd', 'min()')" />

<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/replace(concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day']), '-(\d)([-$])', '-0$1$2')" />
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']), 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']), 2))" />
-->
<xsl:value-of select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub' and string-length(normalize-space(./*[local-name()='year'])) = 4]/
concat(./*[local-name()='year'], '-',
substring(concat('0', ./*[local-name()='month'], '1'), string-length(./*[local-name()='month']) idiv 2 + 1, 2), '-',
substring(concat('0', ./*[local-name()='day'], '1'), string-length(./*[local-name()='day']) idiv 2 + 1, 2))" />
</xsl:element>
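<!-- substring(concat('0', $m, '1'), string-length($m) idiv 2 + 1, 2) left-pads a
     1-digit month/day with '0' and leaves a 2-digit value unchanged; the trailing
     '1' makes an empty month/day default to '01'. E.g. '7' -> '071' -> '07',
     '12' -> '0121' -> '12', '' -> '01' -> '01'. -->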

<xsl:for-each select="//*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='epub']">
<xsl:choose>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2] and ./*[local-name()='day' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'], '-', ./*[local-name()='day'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4] and ./*[local-name()='month' and string-length(normalize-space(.)) = 2]">
<dc:date>
<xsl:value-of select="concat(./*[local-name()='year'], '-', ./*[local-name()='month'])"/>
</dc:date>
</xsl:when>
<xsl:when test="./*[local-name()='year' and string-length(normalize-space(.)) = 4]">
<dc:date>
<xsl:value-of select="./*[local-name()='year']"/>
</dc:date>
</xsl:when>
</xsl:choose>
</xsl:for-each>
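<!-- dc:date degrades gracefully: full YYYY-MM-DD when year, month and day are all
     well-formed, otherwise YYYY-MM, otherwise just YYYY. -->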

<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()='meta-value'], //*[local-name()='permissions']/*[local-name()='copyright-statement']"/>
<xsl:with-param name="targetElement" select="'dc:rights'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='permissions']/*[local-name()='license']/@xlink:href"/>
<xsl:with-param name="targetElement" select="'oaf:license'"/>
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']"/>
<xsl:with-param name="targetElement" select="'dc:relation'"/>
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']"/>
</xsl:call-template>

<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri'][not(./@content-type = 'application/pdf')]/@xlink:href">
<oaf:identifier>
<xsl:attribute name="identifierType">
<xsl:text>landingPage</xsl:text>
</xsl:attribute>
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='article-meta']/*[local-name()='self-uri' and ./@content-type='application/pdf' and //oaf:datasourceprefix = ('ambientesust', 'qualityinedu')]/@xlink:href/replace(., '/view/', '/download/')">
<oaf:fulltext>
<xsl:value-of select="."/>
</oaf:fulltext>
</xsl:for-each>

<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI) or ends-with(., $varFP7OtherDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])"/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>

<!-- -->
<xsl:variable name='varRights' select="distinct-values((for $i in (
//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href,
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and not( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
and not( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())])]/'open',
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read'
and (( ./@start_date[(xs:date( max( (string(.), '0001-01-01') ) ) gt current-date())])
or ( ./@end_date[(xs:date( max( (string(.), '0001-01-01') ) ) lt current-date())]))]/'embargo')
return vocabulary:clean( normalize-space($i), 'dnet:access_modes') "
/>
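<!-- varRights collects candidate access-right statements and maps each through the
     dnet:access_modes vocabulary: the access-right custom-meta, the license URL,
     and free_to_read elements, where a start_date in the future or an end_date in
     the past yields 'embargo' and an unrestricted window yields 'open'. -->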

<!--
and not((xs:date( max( (start_date, '0001-01-01') ) ) gt current-date()))
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'free_to_read' and and not((xs:date( max( (./@start_date, '0001-01-01') ) ) gt current-date()))]/'open'
-->

<oaf:accessrights>
<xsl:choose>
<xsl:when test="$varRights[. = 'EMBARGO']">
<xsl:value-of select="'EMBARGO'"/>
</xsl:when>
<xsl:when test="$varRights[. != 'UNKNOWN']">
<xsl:value-of select="$varRights[. != 'UNKNOWN'][1]"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$varRights[1]"/>
</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>

<!--
<oaf:accessrights>
<xsl:value-of select="$varRights[1]"/>
</oaf:accessrights>

<xsl:element name="oaf:accessrights">
<xsl:value-of select="(//*[local-name()='custom-meta-group']/*[local-name()='custom-meta'][./@specific-use='access-right']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article-meta']/*[local-name() = 'permissions']/*[local-name() = 'license']/@xlink:href)/TransformationFunction:convertString($tf, ., 'AccessRights')" />
</xsl:element>
-->

<!--
<xsl:element name="dr:CobjCategory">
<xsl:variable name='varCobjCategory' select="TransformationFunction:convertString($tf, //*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()='meta-value'], 'TextTypologies')" />
<xsl:variable name='varSuperType' select="TransformationFunction:convertString($tf, $varCobjCategory, 'SuperTypes')" />
<xsl:attribute name="type" select="$varSuperType"/>
<xsl:value-of select="$varCobjCategory" />
</xsl:element>

<xsl:variable name='varCobjCatLst' select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'TextTypologies')" />
-->

<xsl:variable name='varTypLst' select="distinct-values((//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type))"/>
<xsl:variable name='varCobjCatLst' select="distinct-values((for $i in $varTypLst
return vocabulary:clean( normalize-space($i), 'dnet:publication_resource')))" />
<xsl:variable name='varCobjSupLst' select="for $i in $varCobjCatLst
return concat($i, '###', vocabulary:clean( normalize-space($i), 'dnet:result_typologies'))" />
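<!-- Each entry pairs the cleaned type code with its result typology as
     'code###supertype', so the choose below can prefer specific, non-'other'
     typologies (skipping the generic codes 0038/0039/0040, then 0020/0000)
     before falling back to type 'other' with code '0000'. -->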
<dr:CobjCategory>
<xsl:choose>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other') and not(substring-before(., '###') = ('0038', '0039', '0040'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-after(., '###') = 'other')]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-after(., '###') = 'other')][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0020', '0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:when test="count($varCobjSupLst[not(substring-before(., '###') = ('0000'))]) > 0">
<xsl:variable name='varCobjSup' select="$varCobjSupLst[not(substring-before(., '###') = ('0000'))][1]" />
<xsl:attribute name="type" select="substring-after($varCobjSup, '###')"/>
<xsl:value-of select="substring-before($varCobjSup, '###')" />
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="type" select="'other'"/>
<xsl:value-of select="'0000'" />
</xsl:otherwise>
</xsl:choose>
</dr:CobjCategory>

<!--
<xsl:for-each select="$varCobjSupLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->

<xsl:for-each select="$varTypLst">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>

<!--
<xsl:for-each select="(//*[local-name()='article']/@article-type, //*[local-name() = 'custom-meta' and ./@specific-use = 'resource-type']/*[local-name() = ('meta-value', 'meta-name')])">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
-->

<oaf:language>
<xsl:value-of select="vocabulary:clean( //*[local-name()='metadata']//*[local-name()='article']/@xml:lang, 'dnet:languages')" />
</oaf:language>

<!-- review status -->
<!-- ToDo:
review status
~ ask Journal.fi to put it elsewhere
~ evaluate article-version (no example found yet)
subject/kwd:
~ handle thesauri (no example found yet)
relations:
~ handle fn (no example found yet)
-->
<!--
<xsl:variable name="varRefereedConvt" select="for $i in (
//*[local-name() = 'article-meta']/*[local-name() = 'custom-meta-group']/*[local-name() = 'custom-meta'][./@specific-use='resource-type']/*[local-name()=('meta-value', 'meta-name')],
//*[local-name() = 'article']/@article-type)
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')"/>
-->

<xsl:variable name="varRefereedConvt" select="for $i in ($varTypLst)
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
<xsl:variable name="varRefereedDescp" select="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]/'0001'"/>
<xsl:variable name="varRefereedSubjt" select="//*[local-name() = 'article-categories' and contains(//dri:recordIdentifier, 'oai:journal.fi')]/*[local-name() = 'subj-group' and ./@subj-group-type='heading']/*[local-name() = 'subject' and . = 'Peer reviewed articles']/'0001'"/>
<xsl:variable name="varRefereed" select="($varRefereedConvt, $varRefereedDescp, $varRefereedSubjt)"/>
<!--
<oaf:refereed>
<xsl:value-of select="$varRefereedDescp"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="$varRefereed"/>
</oaf:refereed>
<oaf:refereed>
<xsl:value-of select="count($varRefereed[. = '0001']) > 0"/>
</oaf:refereed>
-->
<xsl:choose>
<xsl:when test="count($varRefereed[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'"/>
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereed[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'"/>
</oaf:refereed>
</xsl:when>
</xsl:choose>

<xsl:call-template name="journal">
<xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']"/>
<xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']"/>
<xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']"/>
<xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']"/>
<xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']"/>
<xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']"/>
<xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']"/>
</xsl:call-template>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>

</xsl:template>

<xsl:template name="allElements">
<xsl:param name="sourceElement"/>
<xsl:param name="targetElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:if test="(.[@xml:lang] or ..[@xml:lang]) and $targetElement = ('dc:title', 'dc:description', 'dc:subject')">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="normalize-space(.)"/>
</xsl:element>
</xsl:for-each>
</xsl:template>

<xsl:template name="title">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:title">
<xsl:if test=".[@xml:lang] or ..[@xml:lang]">
<xsl:attribute name="xml:lang">
<xsl:value-of select="(./@xml:lang, ../@xml:lang)[1]"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="string-join((., ./following-sibling::*[local-name() = ('subtitle', 'trans-subtitle')])/normalize-space(.), ': ')"/>
</xsl:element>
</xsl:for-each>
</xsl:template>
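<!-- The title template joins a title with its following subtitle/trans-subtitle
     siblings as 'title: subtitle', and takes xml:lang from the element itself or,
     as with journal.fi trans-title-group, from its parent. -->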

<xsl:template name="journal">
<xsl:param name="journalTitle"/>
<xsl:param name="issn"/>
<xsl:param name="eissn"/>
<xsl:param name="vol"/>
<xsl:param name="issue"/>
<xsl:param name="sp"/>
<xsl:param name="ep"/>
<xsl:element name="oaf:journal">
<xsl:attribute name="issn">
<xsl:value-of select="normalize-space($issn)"/>
</xsl:attribute>
<xsl:attribute name="eissn">
<xsl:value-of select="normalize-space($eissn)"/>
</xsl:attribute>
<xsl:attribute name="vol">
<xsl:value-of select="normalize-space($vol)"/>
</xsl:attribute>
<xsl:attribute name="iss">
<xsl:value-of select="normalize-space($issue)"/>
</xsl:attribute>
<xsl:attribute name="sp">
<xsl:value-of select="normalize-space($sp)"/>
</xsl:attribute>
<xsl:attribute name="ep">
<xsl:value-of select="normalize-space($ep)"/>
</xsl:attribute>
<xsl:value-of select="normalize-space($journalTitle)"/>
</xsl:element>
</xsl:template>

<xsl:template name="identifiers">
<xsl:param name="sourceElement"/>
<xsl:if test="string-length($sourceElement[@pub-id-type='doi']) gt 0">
<xsl:element name="oaf:identifier">
<xsl:attribute name="identifierType">
<xsl:text>doi</xsl:text>
</xsl:attribute>
<xsl:value-of select="$sourceElement[@pub-id-type='doi']"/>
</xsl:element>
</xsl:if>
</xsl:template>

<xsl:template name="authors">
<xsl:param name="sourceElement"/>
<xsl:for-each select="$sourceElement">
<xsl:element name="dc:creator">
<xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
<xsl:attribute name="nameIdentifierScheme">
<xsl:text>ORCID</xsl:text>
</xsl:attribute>
<xsl:attribute name="schemeURI">
<xsl:text>http://orcid.org/</xsl:text>
</xsl:attribute>
<xsl:attribute name="nameIdentifier">
<xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')"/>
</xsl:attribute>
</xsl:if>
<xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
</xsl:element>
</xsl:for-each>
</xsl:template>

<xsl:template match="//*[local-name() = 'header']">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
@@ -0,0 +1,373 @@
<!-- for adaptation, 2021-06-14 PROD -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
exclude-result-prefixes="xsl vocabulary dateCleaner"
version="2.0">

<xsl:param name="varOfficialName" />
<xsl:param name="varDsType" />
<xsl:param name="varDataSourceId" />
<xsl:output indent="yes" omit-xml-declaration="yes" />
<xsl:param name="varHostedById" select="'opendoar____::908'" />
<xsl:param name="varHostedByName" select="'Europe PubMed Central'" />

<xsl:param name="varFP7FundRefDOI" select="'10.13039/501100004963'" />
<xsl:param name="varH2020FundRefDOI" select="'10.13039/501100007601'" />
<xsl:param name="varFP7" select="'corda_______::'" />
<xsl:param name="varH2020" select="'corda__h2020::'" />
<xsl:param name="epmcUrlPrefix" select="'http://europepmc.org/articles/'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
<xsl:param name="index" select="0" />
<xsl:param name="transDate" select="current-dateTime()" />
<xsl:variable name="year" select="format-number( ( //*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='year'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='year']), '0000')" />
<xsl:variable name="month" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='month'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='month']), '00')" />
<xsl:variable name="day" select="format-number( (//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub']/*[local-name()='day'] | //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']/*[local-name()='day']), '00')" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<metadata>
<xsl:if test="not(//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0])">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']//*[local-name()='article-title'][string-length(normalize-space(.))> 0]" />
<xsl:with-param name="targetElement" select="'dc:title'" />
</xsl:call-template>
<xsl:call-template name="authors">
<!--
<xsl:with-param name="sourceElement" select="//*[local-name() = 'contrib'][@contrib-type='author']"/>
-->
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']/*[local-name() = 'contrib'][@contrib-type='author'][not(exists(child::*:collab))][./*[local-name()='name'] or ./*[local-name()='name-alternatives']/*[local-name()='name']][string-length(.//*[local-name()='surname']) + string-length(.//*[local-name()='given-names']) > 0]" />
</xsl:call-template> <!-- <xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor"/>
<xsl:with-param name="targetElement" select="'dc:contributor'"/>
</xsl:call-template>
-->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-meta']/*[local-name()='abstract']" />
<xsl:with-param name="targetElement" select="'dc:description'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-categories']//*[local-name()='subject']" />
<xsl:with-param name="targetElement" select="'dc:subject'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='kwd-group' and not(lower-case(@kwd-group-type)=('mesh', 'ocis'))]//*[local-name()='kwd']" />
<xsl:with-param name="targetElement" select="'dc:subject'" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='mesh' and ./*[local-name()='kwd']]">
<xsl:for-each select="./*[local-name()='kwd']">
<dc:subject>
<xsl:attribute name="subjectScheme" select="'mesh'" />
<xsl:attribute name="schemeURI" select="'http://www.nlm.nih.gov/mesh/'" />
<xsl:attribute name="valueURI" select="''" />
<xsl:value-of select="./concat('mesh:', replace(., 'mesh (.*)$', '$1'))" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='kwd-group' and lower-case(@kwd-group-type)='ocis' and ./*[local-name()='kwd']]">
<xsl:for-each select="./*[local-name()='kwd']">
<dc:subject>
<xsl:attribute name="subjectScheme" select="'ocis'" />
<xsl:attribute name="schemeURI" select="''" />
<xsl:attribute name="valueURI" select="''" />
<xsl:value-of select="./concat('ocis:', .)" />
</dc:subject>
</xsl:for-each>
</xsl:for-each>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='publisher']/*[local-name()='publisher-name']" />
<xsl:with-param name="targetElement" select="'dc:publisher'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']" />
<xsl:with-param name="targetElement" select="'dc:source'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name() = 'article-meta']/(*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version'], *[local-name() = 'article-version'])/concat('article-version (', @article-version-type, ') ', .)" />
<xsl:with-param name="targetElement" select="'dc:source'" />
</xsl:call-template>
<xsl:element name="dc:language">
<xsl:text>eng</xsl:text>
</xsl:element>
<xsl:element name="dc:identifier">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
<xsl:element name="oaf:fulltext">
<xsl:value-of select="concat($epmcUrlPrefix, //*[local-name()='article-id'][@pub-id-type='pmcid'])" />
</xsl:element>
<xsl:element name="oaf:dateAccepted">
<xsl:choose>
<xsl:when test="//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='epub'] or //*[local-name()='article-meta']//*[local-name()='pub-date'][@date-type='pub' and @publication-format='electronic']">
<xsl:if test="string(number($month)) eq 'NaN'">
<xsl:value-of select="concat($year, '-', '01', '-', '01')" />
</xsl:if>
<xsl:if test="string(number($month)) != 'NaN'">
<xsl:value-of select="concat($year, '-', $month, '-', '01')" />
</xsl:if>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat(//*[local-name()='article-meta']//*[local-name()='pub-date'][@pub-type='ppub']/*[local-name()='year'], '-01-01')" />
</xsl:otherwise>
</xsl:choose>
</xsl:element>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='copyright-statement'])" />
<xsl:with-param name="targetElement" select="'dc:rights'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="distinct-values(//*[local-name()='permissions']/*[local-name()='license'])" />
<xsl:with-param name="targetElement" select="'dc:rights'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//*[local-name()='fn-group']//*[local-name()='fn']" />
<xsl:with-param name="targetElement" select="'dc:relation'" />
</xsl:call-template>
<xsl:call-template name="identifiers">
<xsl:with-param name="sourceElement" select="//*[local-name()='article-id']" />
</xsl:call-template>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varFP7FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, ./*[local-name()='award-id'])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:for-each select="//*[local-name()='award-group'][.//*[local-name()='institution-id'][ends-with(., $varH2020FundRefDOI)]]">
<xsl:if test="./*[local-name()='award-id'][matches(normalize-space(.), '(^\d\d\d\d\d\d$)', 'i')]">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, ./*[local-name()='award-id'])" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:element name="oaf:accessrights">
<xsl:text>OPEN</xsl:text>
</xsl:element>
<xsl:element name="dr:CobjCategory">
<xsl:attribute name="type" select="'publication'" />
<xsl:text>0001</xsl:text>
</xsl:element>
<dc:type>
<xsl:value-of select="//*[local-name() = 'article']/@article-type" />
</dc:type>

<xsl:variable name="varRefereedConvt" select="for $i in (//*[local-name() = 'resource']/*[local-name() = ('resourceType', 'version')]/(., @uri))
return vocabulary:clean( normalize-space($i), 'dnet:review_levels')"/>
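<!-- Note: resource/resourceType and version are DataCite-style elements; since this
     stylesheet is marked 'for adaptation', this selector presumably still needs to
     be pointed at the JATS equivalents (e.g. @article-type), as in the commented
     variant below. -->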

<!-- <xsl:variable name="varRefereedConvt" select="for $i in distinct-values((//*[local-name() = 'article']/@article-type, //oai:setSpec))
return TransformationFunction:convertString($tf, normalize-space($i), 'ReviewLevels')" />
-->
<xsl:choose>
<xsl:when test="count($varRefereedConvt[. = '0001']) > 0">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'article-id'][@pub-id-type='doi'][matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = ('abstract', 'trans-abstract')][matches(lower-case(.), '^\s*(.p.\s*)?refereed\s*article.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = ('fn-group', 'notes')][
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*review\s*information.*') or
matches(lower-case(.), '.*the\s*peer[\.\-_/\s\(\)]*review\s*history\s*for\s*this\s*article\s*is\s*available\s*at .*') or
matches(lower-case(.), '.*provenance\s*and\s*peer[\.\-_/\s\(\)]*review.*') or
matches(lower-case(.), '.*externally\s*peer[\.\-_/\s\(\)]*reviewed.*') or
matches(lower-case(.), '.*peer[\.\-_/\s\(\)]*reviewed\s*by.*') or
matches(lower-case(.), '.*refereed\s*anonymously.*') or
matches(lower-case(.), '.*peer\s*reviewer\s*reports\s*are\s*available.*') or
matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\].*') or
matches(lower-case(.), '.*\[.*referees\s*:.*\].*') or
matches(lower-case(.), '^\s*plagiarism[\s\-\._]check.*') or
matches(lower-case(.), '^\s*peer[\s\-\._]*review.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)reviewer.*') or
matches(lower-case(.), '^\s*(open\s*peer[\s\-\._]*|p-)review\s*reports?.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('article-meta', 'app', 'app-group')]/*[local-name() = 'supplementary-material']/*[local-name() = 'media'][
matches(lower-case(.), '.*peer\s*review\s*file.*')]">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta']/*[local-name() = 'contrib-group']
[./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or
./*[local-name() = 'contrib'][./@role/lower-case(.) = ('reviewer', 'solicited external reviewer') or ./*[local-name() = 'role' and lower-case(.) = ('reviewer', 'solicited external reviewer')] or ./@contrib-type/lower-case(.) = 'reviewer']]">
<oaf:refereed>
<xsl:value-of select="'0001'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="count($varRefereedConvt[. = '0002']) > 0">
<oaf:refereed>
<xsl:value-of select="'0002'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = ('related-article')][./@related-article-type = ('peer-reviewed-article', 'reviewed-article')]">
<oaf:refereed>
<xsl:value-of select="'0002'" />
</oaf:refereed>
</xsl:when>
<xsl:when test="//*[local-name() = 'article-meta'][./*[local-name() = 'article-version-alternatives']/*[local-name() = 'article-version' and . = 'preprint'] or ./*[local-name() = 'article-version' and . = 'preprint']]">
|
||||
<oaf:refereed>
|
||||
<xsl:value-of select="'0002'" />
|
||||
</oaf:refereed>
|
||||
</xsl:when>
|
||||
</xsl:choose>
<xsl:call-template name="journal">
    <xsl:with-param name="journalTitle" select="//*[local-name()='journal-meta']//*[local-name()='journal-title']" />
    <xsl:with-param name="issn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='ppub']" />
    <xsl:with-param name="eissn" select="//*[local-name()='journal-meta']/*[local-name()='issn'][@pub-type='epub']" />
    <xsl:with-param name="vol" select="//*[local-name()='article-meta']/*[local-name()='volume']" />
    <xsl:with-param name="issue" select="//*[local-name()='article-meta']/*[local-name()='issue']" />
    <xsl:with-param name="sp" select="//*[local-name()='article-meta']/*[local-name()='fpage']" />
    <xsl:with-param name="ep" select="//*[local-name()='article-meta']/*[local-name()='lpage']" />
</xsl:call-template>
<oaf:hostedBy>
    <xsl:attribute name="name">
        <xsl:value-of select="$varHostedByName" />
    </xsl:attribute>
    <xsl:attribute name="id">
        <xsl:value-of select="$varHostedById" />
    </xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
    <xsl:attribute name="name">
        <xsl:value-of select="$varOfficialName" />
    </xsl:attribute>
    <xsl:attribute name="id">
        <xsl:value-of select="$varDataSourceId" />
    </xsl:attribute>
</oaf:collectedFrom>
<xsl:for-each select="//*[local-name() = 'article']/*[local-name() = ('back', 'front')]/*[local-name() = 'fn-group']/*[local-name() = 'fn'][matches(lower-case(.), 'country(/territory)? of origin:?\s*[A-Za-z\-]+')]">
    <oaf:country>
        <!--
        <xsl:value-of select="TransformationFunction:convertString($tf, replace(lower-case(.), '^(.|\s)*country(/territory)? of origin:?\s+([A-Za-z\-,\(\)]+(\s+[A-Za-z\-,\(\)]+)*)(.|\s)*$', '$3'), 'Countries')"/>
        -->
        <!-- ACz, 2021-06-14
        <xsl:value-of select="TransformationFunction:convertString($tf, normalize-space(substring(substring-after(lower-case(.), 'of origin'), 2)), 'Countries')" />
        -->
        <xsl:value-of select="vocabulary:clean( normalize-space(substring(substring-after(lower-case(.), 'of origin'), 2)), 'dnet:countries')"/>
    </oaf:country>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
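<!-- Helper: copies each node in $sourceElement into a new element named by $targetElement, with whitespace normalized. -->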
<xsl:template name="allElements">
    <xsl:param name="sourceElement" />
    <xsl:param name="targetElement" />
    <xsl:for-each select="$sourceElement">
        <xsl:element name="{$targetElement}">
            <xsl:value-of select="normalize-space(.)" />
        </xsl:element>
    </xsl:for-each>
</xsl:template>
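<!-- Helper: renders oaf:journal with issn/eissn/vol/iss/sp/ep attributes; the element text is the journal title. -->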
<xsl:template name="journal">
    <xsl:param name="journalTitle" />
    <xsl:param name="issn" />
    <xsl:param name="eissn" />
    <xsl:param name="vol" />
    <xsl:param name="issue" />
    <xsl:param name="sp" />
    <xsl:param name="ep" />
    <xsl:element name="oaf:journal">
        <xsl:attribute name="issn">
            <xsl:value-of select="normalize-space($issn)" />
        </xsl:attribute>
        <xsl:attribute name="eissn">
            <xsl:value-of select="normalize-space($eissn)" />
        </xsl:attribute>
        <xsl:attribute name="vol">
            <xsl:value-of select="normalize-space($vol)" />
        </xsl:attribute>
        <xsl:attribute name="iss">
            <xsl:value-of select="normalize-space($issue)" />
        </xsl:attribute>
        <xsl:attribute name="sp">
            <xsl:value-of select="normalize-space($sp)" />
        </xsl:attribute>
        <xsl:attribute name="ep">
            <xsl:value-of select="normalize-space($ep)" />
        </xsl:attribute>
        <xsl:value-of select="normalize-space($journalTitle)" />
    </xsl:element>
</xsl:template>
<xsl:template name="identifiers">
    <xsl:param name="sourceElement" />
    <xsl:element name="oaf:identifier">
        <xsl:attribute name="identifierType">
            <xsl:text>doi</xsl:text>
        </xsl:attribute>
        <xsl:value-of select="$sourceElement[@pub-id-type='doi']" />
    </xsl:element>
    <xsl:element name="oaf:identifier">
        <xsl:attribute name="identifierType">
            <xsl:text>pmc</xsl:text>
        </xsl:attribute>
        <xsl:value-of select="$sourceElement[@pub-id-type='pmcid']" />
    </xsl:element>
    <xsl:element name="oaf:identifier">
        <xsl:attribute name="identifierType">
            <xsl:text>pmid</xsl:text>
        </xsl:attribute>
        <xsl:value-of select="$sourceElement[@pub-id-type='pmid']" />
    </xsl:element>
</xsl:template>
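<!-- Helper: emits one dc:creator per contributor as "surname, given-names"; an ORCID contrib-id, when present, is exposed through DataCite-style nameIdentifier attributes. -->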
<xsl:template name="authors">
    <xsl:param name="sourceElement" />
    <xsl:for-each select="$sourceElement">
        <xsl:element name="dc:creator">
            <xsl:if test="./*[local-name()='contrib-id'][@contrib-id-type='orcid']">
                <xsl:attribute name="nameIdentifierScheme">
                    <xsl:text>ORCID</xsl:text>
                </xsl:attribute>
                <xsl:attribute name="schemeURI">
                    <xsl:text>http://orcid.org/</xsl:text>
                </xsl:attribute>
                <xsl:attribute name="nameIdentifier">
                    <xsl:value-of select="substring-after(./*[local-name()='contrib-id'][@contrib-id-type='orcid'], 'http://orcid.org/')" />
                </xsl:attribute>
            </xsl:if>
            <!--
            <xsl:value-of select="concat(normalize-space(./*[local-name()='name']/*[local-name()='surname']), ', ', normalize-space(./*[local-name()='name']/*[local-name()='given-names']))"/>
            -->
            <xsl:value-of select="concat(normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='surname']), ', ', normalize-space(./(*[local-name()='name'], *[local-name()='name-alternatives']/*[local-name()='name'])/*[local-name()='given-names']))" />
        </xsl:element>
    </xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
    <xsl:copy>
        <xsl:apply-templates select="node()|@*" />
        <xsl:element name="dr:dateOfTransformation">
            <xsl:value-of select="$transDate" />
        </xsl:element>
    </xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
    <xsl:copy>
        <xsl:apply-templates select="node()|@*" />
    </xsl:copy>
</xsl:template>
</xsl:stylesheet>
@@ -4,6 +4,8 @@ package eu.dnetlib.dhp.broker.oa;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;

@@ -47,26 +49,22 @@ public class CheckDuplictedIdsJob {

    final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id");

    final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();

    final Encoder<Tuple2<String, Long>> encoder = Encoders.tuple(Encoders.STRING(), Encoders.LONG());
    ClusterUtils
        .readPath(spark, eventsPath, Event.class)
        .map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
        .groupByKey(t -> t._1, Encoders.STRING())
        .agg(agg)
        .map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
        .filter(t -> t._2 > 1)
        .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
        .map((MapFunction<Event, Tuple2<String, Long>>) e -> new Tuple2<>(e.getEventId(), 1l), encoder)
        .groupByKey((MapFunction<Tuple2<String, Long>, String>) t -> t._1, Encoders.STRING())
        .agg(new CountAggregator().toColumn())
        .map((MapFunction<Tuple2<String, Tuple2<String, Long>>, Tuple2<String, Long>>) t -> t._2, encoder)
        .filter((FilterFunction<Tuple2<String, Long>>) t -> t._2 > 1)
        .map(
            (MapFunction<Tuple2<String, Long>, Tuple2<String, Long>>) o -> ClusterUtils
                .incrementAccumulator(o, total),
            encoder)
        .write()
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .json(countPath);
    ;

    }

    private static String eventAsJsonString(final Event f) throws JsonProcessingException {
        return new ObjectMapper().writeValueAsString(f);
    }

}
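The same refactoring repeats in every file below: each bare lambda passed to map/groupByKey/filter/flatMap gains an explicit functional-interface cast. A minimal sketch of the reason, reusing the Event bean from above for illustration: Dataset exposes both the Scala overload map(scala.Function1, Encoder) and the Java overload map(MapFunction, Encoder), so an uncast Java lambda matches both and the call is ambiguous to javac.

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;

    // events.map(e -> e.getEventId(), Encoders.STRING());  // ambiguous: Function1 vs MapFunction
    Dataset<String> ids = events
        .map((MapFunction<Event, String>) e -> e.getEventId(), Encoders.STRING()); // unambiguous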
@@ -12,6 +12,8 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;

@@ -77,11 +79,11 @@ public class GenerateEventsJob {

    final Dataset<Event> dataset = groups
        .map(
            g -> EventFinder
            (MapFunction<ResultGroup, EventGroup>) g -> EventFinder
                .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
            Encoders
                .bean(EventGroup.class))
        .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
        .flatMap((FlatMapFunction<EventGroup, Event>) g -> g.getData().iterator(), Encoders.bean(Event.class));

    ClusterUtils.save(dataset, eventsPath, Event.class, total);
@@ -13,6 +13,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;

@@ -24,6 +25,7 @@ import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator;
import scala.Tuple2;

public class GenerateStatsJob {

@@ -71,9 +73,14 @@ public class GenerateStatsJob {

    ClusterUtils
        .readPath(spark, eventsPath, Event.class)
        .groupByKey(e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(), Encoders.STRING())
        .groupByKey(
            (MapFunction<Event, String>) e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(),
            Encoders.STRING())
        .agg(aggr)
        .map(t -> t._2, Encoders.bean(DatasourceStats.class))
        .map(
            (MapFunction<Tuple2<String, DatasourceStats>, DatasourceStats>) t -> t._2,
            Encoders.bean(DatasourceStats.class))
        .coalesce(1)
        .write()
        .mode(SaveMode.Overwrite)
        .jdbc(dbUrl, "oa_datasource_stats_temp", connectionProperties);
@@ -13,6 +13,8 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

@@ -30,6 +32,7 @@ import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.subset.EventSubsetAggregator;
import scala.Tuple2;

public class IndexEventSubsetJob {

@@ -83,13 +86,15 @@ public class IndexEventSubsetJob {

    final Dataset<Event> subset = ClusterUtils
        .readPath(spark, eventsPath, Event.class)
        .groupByKey(e -> e.getTopic() + '@' + e.getMap().getTargetDatasourceId(), Encoders.STRING())
        .groupByKey(
            (MapFunction<Event, String>) e -> e.getTopic() + '@' + e.getMap().getTargetDatasourceId(),
            Encoders.STRING())
        .agg(aggr)
        .map(t -> t._2, Encoders.bean(EventGroup.class))
        .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
        .map((MapFunction<Tuple2<String, EventGroup>, EventGroup>) t -> t._2, Encoders.bean(EventGroup.class))
        .flatMap((FlatMapFunction<EventGroup, Event>) g -> g.getData().iterator(), Encoders.bean(Event.class));

    final JavaRDD<String> inputRdd = subset
        .map(e -> prepareEventForIndexing(e, now, total), Encoders.STRING())
        .map((MapFunction<Event, String>) e -> prepareEventForIndexing(e, now, total), Encoders.STRING())
        .javaRDD();

    final Map<String, String> esCfg = new HashMap<>();
@@ -18,7 +18,10 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

@@ -89,13 +92,17 @@ public class IndexNotificationsJob {
    log.info("Number of subscriptions: " + subscriptions.size());

    if (subscriptions.size() > 0) {
        final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
        final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
        final Dataset<Notification> notifications = ClusterUtils
            .readPath(spark, eventsPath, Event.class)
            .map(e -> generateNotifications(e, subscriptions, startTime), Encoders.bean(NotificationGroup.class))
            .flatMap(g -> g.getData().iterator(), Encoders.bean(Notification.class));
            .map(
                (MapFunction<Event, NotificationGroup>) e -> generateNotifications(e, subscriptions, startTime),
                ngEncoder)
            .flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);

        final JavaRDD<String> inputRdd = notifications
            .map(n -> prepareForIndexing(n, total), Encoders.STRING())
            .map((MapFunction<Notification, String>) n -> prepareForIndexing(n, total), Encoders.STRING())
            .javaRDD();

        final Map<String, String> esCfg = new HashMap<>();

@@ -192,15 +199,11 @@ public class IndexNotificationsJob {
        return false;
    }

    if (conditions.containsKey("targetSubjects")
        && !conditions
    return !conditions.containsKey("targetSubjects")
        || conditions
            .get("targetSubjects")
            .stream()
            .allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()))) {
        return false;
    }

    return true;
            .allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));

}
@@ -7,6 +7,7 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.TypedColumn;

@@ -67,9 +68,13 @@ public class JoinStep0Job {

    final Dataset<OaBrokerMainEntity> dataset = sources
        .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
        .groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
        .groupByKey(
            (MapFunction<Tuple2<OaBrokerMainEntity, RelatedDatasource>, String>) t -> t._1.getOpenaireId(),
            Encoders.STRING())
        .agg(aggr)
        .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
        .map(
            (MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
            Encoders.bean(OaBrokerMainEntity.class));

    ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
@@ -69,7 +69,9 @@ public class JoinStep1Job {
        (MapFunction<Tuple2<OaBrokerMainEntity, RelatedProject>, String>) t -> t._1.getOpenaireId(),
        Encoders.STRING())
    .agg(aggr)
    .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
    .map(
        (MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
        Encoders.bean(OaBrokerMainEntity.class));

    ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
@@ -7,6 +7,7 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.TypedColumn;

@@ -64,9 +65,13 @@ public class JoinStep2Job {

    final Dataset<OaBrokerMainEntity> dataset = sources
        .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
        .groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
        .groupByKey(
            (MapFunction<Tuple2<OaBrokerMainEntity, RelatedSoftware>, String>) t -> t._1.getOpenaireId(),
            Encoders.STRING())
        .agg(aggr)
        .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
        .map(
            (MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
            Encoders.bean(OaBrokerMainEntity.class));

    ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
@@ -69,7 +69,9 @@ public class JoinStep3Job {
        (MapFunction<Tuple2<OaBrokerMainEntity, RelatedDataset>, String>) t -> t._1.getOpenaireId(),
        Encoders.STRING())
    .agg(aggr)
    .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
    .map(
        (MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
        Encoders.bean(OaBrokerMainEntity.class));

    ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
@@ -69,7 +69,9 @@ public class JoinStep4Job {
        (MapFunction<Tuple2<OaBrokerMainEntity, RelatedPublication>, String>) t -> t._1.getOpenaireId(),
        Encoders.STRING())
    .agg(aggr)
    .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
    .map(
        (MapFunction<Tuple2<String, OaBrokerMainEntity>, OaBrokerMainEntity>) t -> t._2,
        Encoders.bean(OaBrokerMainEntity.class));

    ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
@@ -7,6 +7,8 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

@@ -64,7 +65,7 @@ public class PrepareGroupsJob {

    final Dataset<Relation> mergedRels = ClusterUtils
        .loadRelations(graphPath, spark)
        .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
        .filter((FilterFunction<Relation>) r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));

    final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
        .toColumn();

@@ -75,8 +76,9 @@ public class PrepareGroupsJob {
        (MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(),
        Encoders.STRING())
    .agg(aggr)
    .map(t -> t._2, Encoders.bean(ResultGroup.class))
    .filter(rg -> rg.getData().size() > 1);
    .map(
        (MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
    .filter((FilterFunction<ResultGroup>) rg -> rg.getData().size() > 1);

    ClusterUtils.save(dataset, groupsPath, ResultGroup.class, total);
@@ -7,6 +7,8 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;

@@ -20,6 +22,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class PrepareRelatedDatasetsJob {

@@ -58,20 +61,22 @@ public class PrepareRelatedDatasetsJob {

    final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
        .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
        .filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
        .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
        .filter((FilterFunction<eu.dnetlib.dhp.schema.oaf.Dataset>) d -> !ClusterUtils.isDedupRoot(d.getId()))
        .map(
            (MapFunction<eu.dnetlib.dhp.schema.oaf.Dataset, OaBrokerRelatedDataset>) ConversionUtils::oafDatasetToBrokerDataset,
            Encoders.bean(OaBrokerRelatedDataset.class));

    final Dataset<Relation> rels = ClusterUtils
        .loadRelations(graphPath, spark)
        .filter(r -> r.getDataInfo().getDeletedbyinference())
        .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
        .filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
        .filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter((FilterFunction<Relation>) r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));

    final Dataset<RelatedDataset> dataset = rels
        .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
        .map(t -> {
        .map((MapFunction<Tuple2<Relation, OaBrokerRelatedDataset>, RelatedDataset>) t -> {
            final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
                t._2);
            rel.getRelDataset().setRelType(t._1.getRelClass());
@@ -7,6 +7,9 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

@@ -25,6 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
import scala.Tuple3;

public class PrepareRelatedDatasourcesJob {

@@ -70,17 +74,20 @@ public class PrepareRelatedDatasourcesJob {

    final Dataset<OaBrokerRelatedDatasource> datasources = ClusterUtils
        .readPath(spark, graphPath + "/datasource", Datasource.class)
        .map(ConversionUtils::oafDatasourceToBrokerDatasource, Encoders.bean(OaBrokerRelatedDatasource.class));
        .map(
            (MapFunction<Datasource, OaBrokerRelatedDatasource>) ConversionUtils::oafDatasourceToBrokerDatasource,
            Encoders.bean(OaBrokerRelatedDatasource.class));

    final Dataset<RelatedDatasource> dataset = rels
        .joinWith(datasources, datasources.col("openaireId").equalTo(rels.col("_2")), "inner")
        .map(t -> {
            final RelatedDatasource r = new RelatedDatasource();
            r.setSource(t._1._1());
            r.setRelDatasource(t._2);
            r.getRelDatasource().setRelType(t._1._3());
            return r;
        }, Encoders.bean(RelatedDatasource.class));
        .map(
            (MapFunction<Tuple2<Tuple3<String, String, String>, OaBrokerRelatedDatasource>, RelatedDatasource>) t -> {
                final RelatedDatasource r = new RelatedDatasource();
                r.setSource(t._1._1());
                r.setRelDatasource(t._2);
                r.getRelDatasource().setRelType(t._1._3());
                return r;
            }, Encoders.bean(RelatedDatasource.class));

    ClusterUtils.save(dataset, relsPath, RelatedDatasource.class, total);

@@ -88,19 +95,22 @@ public class PrepareRelatedDatasourcesJob {

    }
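    // The method below becomes generic (<T extends Result>) so that the
    // (FilterFunction<T>) and (MapFunction<T, ...>) casts type-check against
    // the concrete Result subclass passed in as sourceClass.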
    private static final Dataset<Tuple3<String, String, String>> prepareResultTuples(final SparkSession spark,
    private static final <T extends Result> Dataset<Tuple3<String, String, String>> prepareResultTuples(
        final SparkSession spark,
        final String graphPath,
        final Class<? extends Result> sourceClass) {
        final Class<T> sourceClass) {

        return ClusterUtils
            .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
            .filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
            .filter(r -> r.getDataInfo().getDeletedbyinference())
            .filter((FilterFunction<T>) r -> !ClusterUtils.isDedupRoot(r.getId()))
            .filter((FilterFunction<T>) r -> r.getDataInfo().getDeletedbyinference())
            .map(
                r -> DatasourceRelationsAccumulator.calculateTuples(r),
                (MapFunction<T, DatasourceRelationsAccumulator>) r -> DatasourceRelationsAccumulator.calculateTuples(r),
                Encoders.bean(DatasourceRelationsAccumulator.class))
            .flatMap(
                acc -> acc.getRels().iterator(),
                (FlatMapFunction<DatasourceRelationsAccumulator, Tuple3<String, String, String>>) acc -> acc
                    .getRels()
                    .iterator(),
                Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()));
    }
@@ -7,6 +7,8 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;

@@ -22,6 +24,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class PrepareRelatedProjectsJob {

@@ -60,20 +63,25 @@ public class PrepareRelatedProjectsJob {

    final Dataset<OaBrokerProject> projects = ClusterUtils
        .readPath(spark, graphPath + "/project", Project.class)
        .filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
        .map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
        .filter((FilterFunction<Project>) p -> !ClusterUtils.isDedupRoot(p.getId()))
        .map(
            (MapFunction<Project, OaBrokerProject>) ConversionUtils::oafProjectToBrokerProject,
            Encoders.bean(OaBrokerProject.class));

    final Dataset<Relation> rels = ClusterUtils
        .loadRelations(graphPath, spark)
        .filter(r -> r.getDataInfo().getDeletedbyinference())
        .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
        .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
        .filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
        .filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
        .filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));

    final Dataset<RelatedProject> dataset = rels
        .joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner")
        .map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class));
        .map(
            (MapFunction<Tuple2<Relation, OaBrokerProject>, RelatedProject>) t -> new RelatedProject(
                t._1.getSource(), t._2),
            Encoders.bean(RelatedProject.class));

    ClusterUtils.save(dataset, relsPath, RelatedProject.class, total);
@@ -7,6 +7,8 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;

@@ -21,6 +23,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class PrepareRelatedPublicationsJob {

@@ -59,22 +62,22 @@ public class PrepareRelatedPublicationsJob {

    final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
        .readPath(spark, graphPath + "/publication", Publication.class)
        .filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
        .filter((FilterFunction<Publication>) p -> !ClusterUtils.isDedupRoot(p.getId()))
        .map(
            ConversionUtils::oafPublicationToBrokerPublication,
            (MapFunction<Publication, OaBrokerRelatedPublication>) ConversionUtils::oafPublicationToBrokerPublication,
            Encoders.bean(OaBrokerRelatedPublication.class));

    final Dataset<Relation> rels = ClusterUtils
        .loadRelations(graphPath, spark)
        .filter(r -> r.getDataInfo().getDeletedbyinference())
        .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
        .filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
        .filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter((FilterFunction<Relation>) r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));

    final Dataset<RelatedPublication> dataset = rels
        .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
        .map(t -> {
        .map((MapFunction<Tuple2<Relation, OaBrokerRelatedPublication>, RelatedPublication>) t -> {
            final RelatedPublication rel = new RelatedPublication(
                t._1.getSource(), t._2);
            rel.getRelPublication().setRelType(t._1.getRelClass());
@@ -7,7 +7,10 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;

@@ -22,6 +25,7 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;

public class PrepareRelatedSoftwaresJob {

@@ -58,22 +62,30 @@ public class PrepareRelatedSoftwaresJob {

    final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");

    final Encoder<OaBrokerRelatedSoftware> obrsEncoder = Encoders.bean(OaBrokerRelatedSoftware.class);
    final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
        .readPath(spark, graphPath + "/software", Software.class)
        .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
        .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
        .filter((FilterFunction<Software>) sw -> !ClusterUtils.isDedupRoot(sw.getId()))
        .map(
            (MapFunction<Software, OaBrokerRelatedSoftware>) ConversionUtils::oafSoftwareToBrokerSoftware,
            obrsEncoder);

    final Dataset<Relation> rels = ClusterUtils
    final Dataset<Relation> rels;
    rels = ClusterUtils
        .loadRelations(graphPath, spark)
        .filter(r -> r.getDataInfo().getDeletedbyinference())
        .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
        .filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
        .filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
        .filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
        .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));

    final Encoder<RelatedSoftware> rsEncoder = Encoders.bean(RelatedSoftware.class);
    final Dataset<RelatedSoftware> dataset = rels
        .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
        .map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class));
        .map(
            (MapFunction<Tuple2<Relation, OaBrokerRelatedSoftware>, RelatedSoftware>) t -> new RelatedSoftware(
                t._1.getSource(), t._2),
            rsEncoder);

    ClusterUtils.save(dataset, relsPath, RelatedSoftware.class, total);
@@ -7,7 +7,10 @@ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

@@ -73,11 +76,12 @@ public class PrepareSimpleEntititiesJob {
    final String graphPath,
    final Class<SRC> sourceClass) {

    final Encoder<OaBrokerMainEntity> encoder = Encoders.bean(OaBrokerMainEntity.class);
    return ClusterUtils
        .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
        .filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
        .filter(r -> r.getDataInfo().getDeletedbyinference())
        .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
        .filter((FilterFunction<SRC>) r -> !ClusterUtils.isDedupRoot(r.getId()))
        .filter((FilterFunction<SRC>) r -> r.getDataInfo().getDeletedbyinference())
        .map((MapFunction<SRC, OaBrokerMainEntity>) ConversionUtils::oafResultToBrokerResult, encoder);
    }

}