diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index acac3594f..74f31cf35 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -21,6 +21,10 @@ <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> </dependency> + <dependency> + <groupId>com.github.sisyphsu</groupId> + <artifactId>dateparser</artifactId> + </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.11</artifactId> diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java deleted file mode 100644 index 59fe941ed..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ /dev/null @@ -1,182 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Date; -import java.util.Objects; -import java.util.UUID; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; -import javax.persistence.Temporal; -import javax.persistence.TemporalType; - -@Entity -@Table(name = "mdstores") -public class MDStore implements Serializable { - - /** */ - private static final long serialVersionUID = 3160530489149700055L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "format") - private String format; - - @Column(name = "layout") - private String layout; - - @Column(name = "interpretation") - private String interpretation; - - @Column(name = "datasource_name") - private String datasourceName; - - @Column(name = "datasource_id") - private String datasourceId; - - @Column(name = "api_id") - private String apiId; - - @Column(name = "hdfs_path") - private String hdfsPath; - - @Column(name = "creation_date") - @Temporal(TemporalType.TIMESTAMP) - private Date creationDate; - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getFormat() { - return format; - } - - public void setFormat(final String format) { - this.format = format; - } - - public String getLayout() { - return layout; - } - - public void setLayout(final String layout) { - this.layout = layout; - } - - public String getInterpretation() { - return interpretation; - } - - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getApiId() { - return apiId; - } - - public void setApiId(final String apiId) { - this.apiId = apiId; - } - - public String getHdfsPath() { - return hdfsPath; - } - - public void setHdfsPath(final String hdfsPath) { - this.hdfsPath = hdfsPath; - } - - public Date getCreationDate() { - return creationDate; - } - - public void setCreationDate(final Date creationDate) { - this.creationDate = creationDate; - } - - public static MDStore newInstance( - final String format, - final String layout, - final String interpretation, - final String hdfsBasePath) { - return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath); - } - - public static MDStore newInstance( - final String format, - final String layout, - final String interpretation, - final String dsName, - final String dsId, - final String apiId, - final String hdfsBasePath) { - - final String mdId = "md-" + UUID.randomUUID(); - 
final MDStore md = new MDStore(); - md.setId(mdId); - md.setFormat(format); - md.setLayout(layout); - md.setInterpretation(interpretation); - md.setCreationDate(new Date()); - md.setDatasourceName(dsName); - md.setDatasourceId(dsId); - md.setApiId(apiId); - md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId)); - - return md; - } - - @Override - public String toString() { - return String - .format( - "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", - id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); - } - - @Override - public int hashCode() { - return Objects.hash(id); - } - - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof MDStore)) { - return false; - } - final MDStore other = (MDStore) obj; - return Objects.equals(id, other.id); - } - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java deleted file mode 100644 index d808e2de7..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ /dev/null @@ -1,74 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Objects; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; - -@Entity -@Table(name = "mdstore_current_versions") -public class MDStoreCurrentVersion implements Serializable { - - /** */ - private static final long serialVersionUID = -4757725888593745773L; - - @Id - @Column(name = "mdstore") - private String mdstore; - - @Column(name = "current_version") - private String currentVersion; - - public String getMdstore() { - return mdstore; - } - - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } - - public String getCurrentVersion() { - return currentVersion; - } - - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } - - public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { - final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); - cv.setMdstore(mdId); - cv.setCurrentVersion(versionId); - return cv; - } - - public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { - return newInstance(v.getMdstore(), v.getId()); - } - - @Override - public String toString() { - return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion); - } - - @Override - public int hashCode() { - return Objects.hash(currentVersion, mdstore); - } - - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof MDStoreCurrentVersion)) { - return false; - } - final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; - return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java deleted file mode 100644 index 38f8f275e..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ 
/dev/null @@ -1,140 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Date; -import java.util.Objects; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; -import javax.persistence.Temporal; -import javax.persistence.TemporalType; - -@Entity -@Table(name = "mdstore_versions") -public class MDStoreVersion implements Serializable { - - /** */ - private static final long serialVersionUID = -4763494442274298339L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "mdstore") - private String mdstore; - - @Column(name = "writing") - private boolean writing; - - @Column(name = "readcount") - private int readCount = 0; - - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; - - @Column(name = "size") - private long size = 0; - - @Column(name = "hdfs_path") - private String hdfsPath; - - public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) { - final MDStoreVersion v = new MDStoreVersion(); - - final String versionId = mdId + "-" + new Date().getTime(); - v.setId(versionId); - v.setMdstore(mdId); - v.setLastUpdate(null); - v.setWriting(writing); - v.setReadCount(0); - v.setSize(0); - v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId)); - - return v; - } - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getMdstore() { - return mdstore; - } - - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } - - public boolean isWriting() { - return writing; - } - - public void setWriting(final boolean writing) { - this.writing = writing; - } - - public int getReadCount() { - return readCount; - } - - public void setReadCount(final int readCount) { - this.readCount = readCount; - } - - public Date getLastUpdate() { - return lastUpdate; - } - - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } - - public long getSize() { - return size; - } - - public void setSize(final long size) { - this.size = size; - } - - public String getHdfsPath() { - return hdfsPath; - } - - public void setHdfsPath(final String hdfsPath) { - this.hdfsPath = hdfsPath; - } - - @Override - public String toString() { - return String - .format( - "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, - mdstore, writing, readCount, lastUpdate, size, hdfsPath); - } - - @Override - public int hashCode() { - return Objects.hash(id); - } - - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof MDStoreVersion)) { - return false; - } - final MDStoreVersion other = (MDStoreVersion) obj; - return Objects.equals(id, other.id); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java deleted file mode 100644 index 510c65092..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ /dev/null @@ -1,194 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Date; -import java.util.Objects; - -import javax.persistence.Column; -import javax.persistence.Entity; -import 
javax.persistence.Id; -import javax.persistence.Table; -import javax.persistence.Temporal; -import javax.persistence.TemporalType; - -@Entity -@Table(name = "mdstores_with_info") -public class MDStoreWithInfo implements Serializable { - - /** */ - private static final long serialVersionUID = -8445784770687571492L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "format") - private String format; - - @Column(name = "layout") - private String layout; - - @Column(name = "interpretation") - private String interpretation; - - @Column(name = "datasource_name") - private String datasourceName; - - @Column(name = "datasource_id") - private String datasourceId; - - @Column(name = "api_id") - private String apiId; - - @Column(name = "current_version") - private String currentVersion; - - @Column(name = "creation_date") - @Temporal(TemporalType.TIMESTAMP) - private Date creationDate; - - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; - - @Column(name = "size") - private long size = 0; - - @Column(name = "n_versions") - private long numberOfVersions = 0; - - @Column(name = "hdfs_path") - private String hdfsPath; - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getFormat() { - return format; - } - - public void setFormat(final String format) { - this.format = format; - } - - public String getLayout() { - return layout; - } - - public void setLayout(final String layout) { - this.layout = layout; - } - - public String getInterpretation() { - return interpretation; - } - - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getApiId() { - return apiId; - } - - public void setApiId(final String apiId) { - this.apiId = apiId; - } - - public String getCurrentVersion() { - return currentVersion; - } - - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } - - public Date getCreationDate() { - return creationDate; - } - - public void setCreationDate(final Date creationDate) { - this.creationDate = creationDate; - } - - public Date getLastUpdate() { - return lastUpdate; - } - - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } - - public long getSize() { - return size; - } - - public void setSize(final long size) { - this.size = size; - } - - public long getNumberOfVersions() { - return numberOfVersions; - } - - public void setNumberOfVersions(final long numberOfVersions) { - this.numberOfVersions = numberOfVersions; - } - - public String getHdfsPath() { - return hdfsPath; - } - - public void setHdfsPath(final String hdfsPath) { - this.hdfsPath = hdfsPath; - } - - @Override - public String toString() { - return String - .format( - "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", - id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, - lastUpdate, size, numberOfVersions, 
hdfsPath); - } - - @Override - public int hashCode() { - return Objects.hash(id); - } - - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof MDStoreWithInfo)) { - return false; - } - final MDStoreWithInfo other = (MDStoreWithInfo) obj; - return Objects.equals(id, other.id); - } - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java index cedc9bd4d..fabb25f16 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java @@ -14,7 +14,7 @@ public class DbClient implements Closeable { private static final Log log = LogFactory.getLog(DbClient.class); - private Connection connection; + private final Connection connection; public DbClient(final String address, final String login, final String password) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java index 76017d5b7..7dc0e4417 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java @@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable { BufferedInputStream bis = new BufferedInputStream(is); int count; - byte data[] = new byte[1024]; + byte[] data = new byte[1024]; while ((count = bis.read(data, 0, data.length)) != -1) { ar.write(data, 0, count); } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java index c3f393436..c127783e5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java @@ -13,9 +13,9 @@ import okio.Source; public class InputStreamRequestBody extends RequestBody { - private InputStream inputStream; - private MediaType mediaType; - private long lenght; + private final InputStream inputStream; + private final MediaType mediaType; + private final long lenght; public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java index 853d22bc2..98dabf56a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -21,7 +21,7 @@ public class DNetRestClient { private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class); - private static ObjectMapper mapper = new ObjectMapper(); + private static final ObjectMapper mapper = new ObjectMapper(); public static T doGET(final String url, Class clazz) throws Exception { final HttpGet httpGet = new HttpGet(url); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 0c6eacf99..deeda9beb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -34,7 +34,7 @@ public class MessageSender { private final String workflowId; - private ExecutorService executorService = Executors.newCachedThreadPool(); + private final 
ExecutorService executorService = Executors.newCachedThreadPool(); public MessageSender(final String dnetMessageEndpoint, final String workflowId) { this.workflowId = workflowId; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java new file mode 100644 index 000000000..a75cc52e6 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -0,0 +1,459 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.util.*; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.NotNull; + +import com.github.sisyphsu.dateparser.DateParserUtils; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + +public class GraphCleaningFunctions extends CleaningFunctions { + + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; + public static final int ORCID_LEN = 19; + public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; + public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; + public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; + + public static T fixVocabularyNames(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.nonNull(o.getCountry())) { + fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); + } + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); + fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); + fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); + + if (Objects.nonNull(r.getSubject())) { + r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); + fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); + } + } + if (Objects.nonNull(r.getAuthor())) { + r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a.getPid().stream().filter(Objects::nonNull).forEach(p -> { + fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); + }); + } + }); + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + public static boolean filter(T value) { + if (value instanceof Datasource) { + // 
nothing to evaluate here + } else if (value instanceof Project) { + // nothing to evaluate here + } else if (value instanceof Organization) { + // nothing to evaluate here + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) { + return false; + } + + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + return true; + } + + public static T cleanup(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { + o.setCountry(ModelConstants.UNKNOWN_COUNTRY); + } + } else if (value instanceof Relation) { + Relation r = (Relation) value; + + Optional validationDate = doCleanDate(r.getValidationDate()); + if (validationDate.isPresent()) { + r.setValidationDate(validationDate.get()); + r.setValidated(true); + } else { + r.setValidationDate(null); + r.setValidated(false); + } + } else if (value instanceof Result) { + + Result r = (Result) value; + + if (Objects.nonNull(r.getDateofacceptance())) { + Optional date = cleanDateField(r.getDateofacceptance()); + if (date.isPresent()) { + r.getDateofacceptance().setValue(date.get()); + } else { + r.setDateofacceptance(null); + } + } + if (Objects.nonNull(r.getRelevantdate())) { + r + .setRelevantdate( + r + .getRelevantdate() + .stream() + .filter(Objects::nonNull) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(sp -> { + sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue())); + return sp; + }) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } + if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { + r + .setLanguage( + qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); + } + if (Objects.nonNull(r.getSubject())) { + r + .setSubject( + r + .getSubject() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getTitle())) { + r + .setTitle( + r + .getTitle() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter( + sp -> sp + .getValue() + .toLowerCase() + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getDescription())) { + r + .setDescription( + r + .getDescription() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getPid())) { + r.setPid(processPidCleaning(r.getPid())); + } + if (Objects.isNull(r.getResourcetype()) || 
StringUtils.isBlank(r.getResourcetype().getClassid())) { + r + .setResourcetype( + qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + } + if (Objects.nonNull(r.getInstance())) { + + for (Instance i : r.getInstance()) { + if (Objects.nonNull(i.getPid())) { + i.setPid(processPidCleaning(i.getPid())); + } + if (Objects.nonNull(i.getAlternateIdentifier())) { + i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier())); + } + Optional + .ofNullable(i.getPid()) + .ifPresent(pid -> { + final Set pids = Sets.newHashSet(pid); + Optional + .ofNullable(i.getAlternateIdentifier()) + .ifPresent(altId -> { + final Set altIds = Sets.newHashSet(altId); + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + }); + }); + + if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { + i + .setAccessright( + accessRight( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } + if (Objects.isNull(i.getRefereed())) { + i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); + } + if (Objects.nonNull(i.getDateofacceptance())) { + Optional date = cleanDateField(i.getDateofacceptance()); + if (date.isPresent()) { + i.getDateofacceptance().setValue(date.get()); + } else { + i.setDateofacceptance(null); + } + } + } + } + if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance()); + if (Objects.isNull(bestaccessrights)) { + r + .setBestaccessright( + qualifier( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } else { + r.setBestaccessright(bestaccessrights); + } + } + if (Objects.nonNull(r.getAuthor())) { + r + .setAuthor( + r + .getAuthor() + .stream() + .filter(a -> Objects.nonNull(a)) + .filter(a -> StringUtils.isNotBlank(a.getFullname())) + .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .collect(Collectors.toList())); + + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } + } + + for (Author a : r.getAuthor()) { + if (Objects.isNull(a.getPid())) { + a.setPid(Lists.newArrayList()); + } else { + a + .setPid( + a + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + // hack to distinguish orcid from orcid_pending + String pidProvenance = Optional + .ofNullable(p.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getProvenanceaction()) + .map(Qualifier::getClassid) + .orElse("")) + .orElse(""); + if (p + .getQualifier() + .getClassid() + .toLowerCase() + .contains(ModelConstants.ORCID)) { + if (pidProvenance + .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { + p.getQualifier().setClassid(ModelConstants.ORCID); + } else { + p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + } + final String orcid = p + .getValue() + .trim() + .toLowerCase() + .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); + if (orcid.length() == ORCID_LEN) { + p.setValue(orcid); + } else { + p.setValue(""); + } + } + return p; + }) + 
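
Reviewer note on the ORCID block that closes just above: ORCID_CLEANING_REGEX captures four 4-character groups separated by any dash-like character (hyphen, en/em dash, minus sign, equals) and rewrites them into the canonical dashed form; anything that does not normalize to exactly ORCID_LEN (19) characters is blanked and then dropped by the isNotBlank filter that follows. A self-contained sketch of just that behavior; the sample inputs are invented, only the regex and the length rule come from this diff:

```java
import java.util.regex.Pattern;

public class OrcidCleaningSketch {

	// same constants as GraphCleaningFunctions
	static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
	static final int ORCID_LEN = 19; // 16 digits/x plus 3 dashes

	public static void main(final String[] args) {
		// hypothetical dirty values as they might arrive from aggregated records
		for (final String raw : new String[] {
			"https://orcid.org/0000-0002-1825-0097",
			"0000 – 0002 – 1825 – 009x",
			"not an orcid"
		}) {
			// same call chain as in cleanup(): trim, lowercase, rewrite the four groups
			final String orcid = raw.trim().toLowerCase().replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
			// same acceptance rule: keep only exact 19-character results
			System.out.println(orcid.length() == ORCID_LEN ? orcid : "(discarded)");
		}
	}
}
```

Run as-is this prints the two normalized iDs and `(discarded)` for the non-matching string.
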
.filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect( + Collectors + .toMap( + p -> p.getQualifier().getClassid() + p.getValue(), + Function.identity(), + (p1, p2) -> p1, + LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); + } + } + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + private static Optional cleanDateField(Field dateofacceptance) { + return Optional + .ofNullable(dateofacceptance) + .map(Field::getValue) + .map(GraphCleaningFunctions::cleanDate) + .filter(Objects::nonNull); + } + + protected static Optional doCleanDate(String date) { + return Optional.ofNullable(cleanDate(date)); + } + + public static String cleanDate(final String inputDate) { + + if (StringUtils.isBlank(inputDate)) { + return null; + } + + try { + final LocalDate date = DateParserUtils + .parseDate(inputDate.trim()) + .toInstant() + .atZone(ZoneId.systemDefault()) + .toLocalDate(); + return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date); + } catch (DateTimeParseException e) { + return null; + } + } + + // HELPERS + + private static boolean isValidAuthorName(Author a) { + return !Stream + .of(a.getFullname(), a.getName(), a.getSurname()) + .filter(s -> s != null && !s.isEmpty()) + .collect(Collectors.joining("")) + .toLowerCase() + .matches(INVALID_AUTHOR_REGEX); + } + + private static List processPidCleaning(List pids) { + return pids + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) + .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(CleaningFunctions::normalizePidValue) + .filter(CleaningFunctions::pidFilter) + .collect(Collectors.toList()); + } + + private static void fixVocabName(Qualifier q, String vocabularyName) { + if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) { + q.setSchemeid(vocabularyName); + q.setSchemename(vocabularyName); + } + } + + private static AccessRight accessRight(String classid, String classname, String scheme) { + return OafMapperUtils + .accessRight( + classid, classname, scheme, scheme); + } + + private static Qualifier qualifier(String classid, String classname, String scheme) { + return OafMapperUtils + .qualifier( + classid, classname, scheme, scheme); + } + + protected static StructuredProperty cleanValue(StructuredProperty s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + + protected static Field cleanValue(Field s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java new file mode 100644 index 000000000..c6a8fd5a7 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -0,0 +1,368 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import 
eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + +public class OafMapperUtils { + + public static Oaf merge(final Oaf left, final Oaf right) { + if (ModelSupport.isSubClass(left, OafEntity.class)) { + return mergeEntities((OafEntity) left, (OafEntity) right); + } else if (ModelSupport.isSubClass(left, Relation.class)) { + ((Relation) left).mergeFrom((Relation) right); + } else { + throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName()); + } + return left; + } + + public static OafEntity mergeEntities(OafEntity left, OafEntity right) { + if (ModelSupport.isSubClass(left, Result.class)) { + return mergeResults((Result) left, (Result) right); + } else if (ModelSupport.isSubClass(left, Datasource.class)) { + left.mergeFrom(right); + } else if (ModelSupport.isSubClass(left, Organization.class)) { + left.mergeFrom(right); + } else if (ModelSupport.isSubClass(left, Project.class)) { + left.mergeFrom(right); + } else { + throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName()); + } + return left; + } + + public static Result mergeResults(Result left, Result right) { + if (new ResultTypeComparator().compare(left, right) < 0) { + left.mergeFrom(right); + return left; + } else { + right.mergeFrom(left); + return right; + } + } + + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { + throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); + } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } + + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... 
values) { + return Arrays + .stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) + .collect(Collectors.toList()); + } + + public static List> listFields(final DataInfo info, final List values) { + return values + .stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) + .collect(Collectors.toList()); + } + + public static Qualifier unknown(final String schemeid, final String schemename) { + return qualifier("UNKNOWN", "Unknown", schemeid, schemename); + } + + public static AccessRight accessRight( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + return accessRight(classid, classname, schemeid, schemename, null); + } + + public static AccessRight accessRight( + final String classid, + final String classname, + final String schemeid, + final String schemename, + final OpenAccessRoute openAccessRoute) { + final AccessRight accessRight = new AccessRight(); + accessRight.setClassid(classid); + accessRight.setClassname(classname); + accessRight.setSchemeid(schemeid); + accessRight.setSchemename(schemename); + accessRight.setOpenAccessRoute(openAccessRoute); + return accessRight; + } + + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static Qualifier qualifier(final Qualifier qualifier) { + final Qualifier q = new Qualifier(); + q.setClassid(qualifier.getClassid()); + q.setClassname(qualifier.getClassname()); + q.setSchemeid(qualifier.getSchemeid()); + q.setSchemename(qualifier.getSchemename()); + return q; + } + + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static StructuredProperty structuredProperty( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String 
issnOnline, + final String issnLinking, + final DataInfo dataInfo) { + + return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal( + name, + issnPrinted, + issnOnline, + issnLinking, + null, + null, + null, + null, + null, + null, + null, + dataInfo) : null; + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + + if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } + + private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) { + return StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking); + } + + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId( + final int prefix, + final String originalId, + final boolean to_md5) { + if (StringUtils.isBlank(originalId)) { + return null; + } else if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } + + public static String createOpenaireId( + final String type, + final String originalId, + final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } + + public static String asString(final Object o) { + return o == null ? "" : o.toString(); + } + + public static Predicate distinctByKey( + final Function keyExtractor) { + final Map seen = new ConcurrentHashMap<>(); + return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; + } + + public static Qualifier createBestAccessRights(final List instanceList) { + return getBestAccessRights(instanceList); + } + + protected static Qualifier getBestAccessRights(final List instanceList) { + if (instanceList != null) { + final Optional min = instanceList + .stream() + .map(i -> i.getAccessright()) + .min(new AccessRightComparator<>()); + + final Qualifier rights = min.isPresent() ? 
qualifier(min.get()) : new Qualifier(); + + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) + || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } + + return rights; + } + return null; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java index 9552eb2b3..b326c4159 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -1,11 +1,11 @@ package eu.dnetlib.dhp.utils; -import java.util.Map; - -import javax.xml.ws.BindingProvider; - +import org.apache.cxf.endpoint.Client; +import org.apache.cxf.frontend.ClientProxy; import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; +import org.apache.cxf.transport.http.HTTPConduit; +import org.apache.cxf.transports.http.configuration.HTTPClientPolicy; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,8 +15,8 @@ public class ISLookupClientFactory { private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class); - private static int requestTimeout = 60000 * 10; - private static int connectTimeout = 60000 * 10; + private static final int requestTimeout = 60000 * 10; + private static final int connectTimeout = 60000 * 10; public static ISLookUpService getLookUpService(final String isLookupUrl) { return getServiceStub(ISLookUpService.class, isLookupUrl); @@ -31,20 +31,23 @@ public class ISLookupClientFactory { final T service = (T) jaxWsProxyFactory.create(); - if (service instanceof BindingProvider) { + Client client = ClientProxy.getClient(service); + if (client != null) { + HTTPConduit conduit = (HTTPConduit) client.getConduit(); + HTTPClientPolicy policy = new HTTPClientPolicy(); + log .info( - "setting timeouts for {} to requestTimeout: {}, connectTimeout: {}", - BindingProvider.class.getName(), requestTimeout, connectTimeout); + String + .format( + "setting connectTimeout to %s, requestTimeout to %s for service %s", + connectTimeout, + requestTimeout, + clazz.getCanonicalName())); - Map requestContext = ((BindingProvider) service).getRequestContext(); - - requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout); - requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout); - requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout); - requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout); - requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout); - requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout); + policy.setConnectionTimeout(connectTimeout); + policy.setReceiveTimeout(requestTimeout); + conduit.setClient(policy); } return service; diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java new file mode 100644 index 000000000..eefa1e9a3 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -0,0 +1,180 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.*; + +import 
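
A note on the ISLookupClientFactory hunk above: the removed code pushed com.sun.xml.* / javax.xml.ws request-context keys into the BindingProvider, which the JAX-WS RI understands but CXF-generated proxies ignore; the replacement goes through CXF's own HTTPConduit, which is the mechanism CXF actually honors. A minimal sketch of the same pattern, assuming cxf-rt-frontend-jaxws and cxf-rt-transports-http are on the classpath (they supply ClientProxy, HTTPConduit and HTTPClientPolicy):

```java
import org.apache.cxf.endpoint.Client;
import org.apache.cxf.frontend.ClientProxy;
import org.apache.cxf.transport.http.HTTPConduit;
import org.apache.cxf.transports.http.configuration.HTTPClientPolicy;

public class CxfTimeoutSketch {

	// apply connect/receive timeouts to a CXF JAX-WS proxy, as getServiceStub() now does
	public static void setTimeouts(final Object proxy, final long connectMs, final long receiveMs) {
		final Client client = ClientProxy.getClient(proxy);
		final HTTPConduit conduit = (HTTPConduit) client.getConduit();
		final HTTPClientPolicy policy = new HTTPClientPolicy();
		policy.setConnectionTimeout(connectMs); // ms allowed to establish the connection
		policy.setReceiveTimeout(receiveMs); // ms allowed to wait for the response
		conduit.setClient(policy);
	}
}
```

Called right after JaxWsProxyFactoryBean.create(), this is equivalent to the inlined version in the diff.
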
java.io.IOException; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + +public class OafMapperUtilsTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + @Test + public void testDateValidation() { + + assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent()); + assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent()); + assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent()); + + assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get()); + + assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get()); + assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get()); + assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get()); + assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get()); + assertEquals( + "2015-07-03", + GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get()); + assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get()); + assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get()); + assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get()); + assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get()); + assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get()); + assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get()); + assertEquals("2013-02-03", 
GraphCleaningFunctions.doCleanDate("03 February 2013").get()); + assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get()); + assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get()); + assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get()); + assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get()); + assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get()); + assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get()); + assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get()); + assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get()); + assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get()); + assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get()); + assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get()); + assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get()); + assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get()); + assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get()); + assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get()); + assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get()); + assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get()); + assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get()); + assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get()); + assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get()); + assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get()); + assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get()); + assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get()); + assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get()); + assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get()); + assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get()); + assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get()); + assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get()); + assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get()); + assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get()); + assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get()); + assertEquals("2014-12-16", 
GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get()); + assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get()); + assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get()); + assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get()); + assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get()); + assertEquals( + "2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get()); + assertEquals( + "2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get()); + assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get()); + assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get()); + assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get()); + assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get()); + assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get()); + assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get()); + assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get()); + assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get()); + assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get()); + assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get()); + assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get()); + assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get()); + assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get()); + assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get()); + assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get()); + + } + + @Test + public void testDate() { + System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998")); + } + + @Test + public void testMergePubs() throws IOException { + Publication p1 = read("publication_1.json", Publication.class); + Publication p2 = read("publication_2.json", Publication.class); + Dataset d1 = read("dataset_1.json", Dataset.class); + Dataset d2 = read("dataset_2.json", Dataset.class); + + assertEquals(p1.getCollectedfrom().size(), 1); + assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID); + assertEquals(d2.getCollectedfrom().size(), 1); + assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + + assertTrue( + OafMapperUtils + .mergeResults(p1, d2) + .getResulttype() + .getClassid() + .equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)); + + assertEquals(p2.getCollectedfrom().size(), 1); + assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + 
assertEquals(d1.getCollectedfrom().size(), 1); + assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + + assertTrue( + OafMapperUtils + .mergeResults(p2, d1) + .getResulttype() + .getClassid() + .equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)); + } + + protected HashSet cfId(List collectedfrom) { + return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new)); + } + + protected T read(String filename, Class clazz) throws IOException { + final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); + return OBJECT_MAPPER.readValue(json, clazz); + } + +} diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json new file mode 100644 index 000000000..e38c4d1cc --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json new file mode 100644 index 000000000..52e4e126a --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json new file mode 100644 index 000000000..704c5ad4d --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json new file mode 100644 index 000000000..a1744e84e --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json @@ -0,0 +1 @@ 
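
A note on the four one-line fixtures added here: the test's OBJECT_MAPPER disables FAIL_ON_UNKNOWN_PROPERTIES, so JSON keys that do not map onto a bean property are skipped instead of aborting deserialization. A runnable sketch with a stand-in class (Doc is invented for illustration; the real tests bind these fixtures to Publication/Dataset):

```java
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

public class LenientReadSketch {

	// stand-in for the fixture shape; not the project's Publication/Dataset classes
	public static class Doc {
		public String id;
	}

	public static void main(final String[] args) throws Exception {
		final ObjectMapper mapper = new ObjectMapper()
			.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
		// keys unknown to Doc ("resuttype", "pid") are skipped, not fatal
		final Doc d = mapper.readValue(
			"{\"id\":\"50|x\",\"resuttype\":{\"classid\":\"dataset\"},\"pid\":[]}", Doc.class);
		System.out.println(d.id); // 50|x
	}
}
```
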
+{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]} \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 0f0d21e11..5a80c0b53 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -3,20 +3,23 @@ package eu.dnetlib.dhp.actionmanager; import java.io.Serializable; import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; +import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.Triple; import org.dom4j.Document; +import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import eu.dnetlib.actionmanager.rmi.ActionManagerException; import eu.dnetlib.actionmanager.set.ActionManagerSet; @@ -25,6 +28,7 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import scala.Tuple2; public class ISClient implements Serializable { @@ -40,80 +44,52 @@ public class ISClient implements Serializable { public List getLatestRawsetPaths(String setIds) { - List ids = Lists - .newArrayList( + final Set ids = Sets + .newHashSet( Splitter .on(INPUT_ACTION_SET_ID_SEPARATOR) .omitEmptyStrings() .trimResults() .split(setIds)); - - return ids - .stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } - - private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - - final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " - + "where $x//SET/@id = '" - + setId - + "' return $x"; - try { final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); + + // + final String xquery = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + + "return "; + return Optional + .ofNullable(isLookup.quickSearchProfile(xquery)) + .map( + sets -> sets + .stream() + .map(set -> parseSetInfo(set)) + .filter(t -> ids.contains(t.getLeft())) + .map(t -> buildDirectory(basePath, t)) + .collect(Collectors.toList())) + 
diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 0f0d21e11..5a80c0b53 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -3,20 +3,23 @@ package eu.dnetlib.dhp.actionmanager; import java.io.Serializable; import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; +import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.Triple; import org.dom4j.Document; +import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import eu.dnetlib.actionmanager.rmi.ActionManagerException; import eu.dnetlib.actionmanager.set.ActionManagerSet; @@ -25,6 +28,7 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import scala.Tuple2; public class ISClient implements Serializable { @@ -40,80 +44,52 @@ public class ISClient implements Serializable { public List<String> getLatestRawsetPaths(String setIds) { - List<String> ids = Lists - .newArrayList( + final Set<String> ids = Sets + .newHashSet( Splitter .on(INPUT_ACTION_SET_ID_SEPARATOR) .omitEmptyStrings() .trimResults() .split(setIds)); - - return ids - .stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } - - private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - - final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + "where $x//SET/@id = '" + setId + "' return $x"; try { final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); + + // <SET id="..." directory="..." latest="..."/> + final String xquery = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + "return <SET id='{$x//SET/@id}' directory='{$x//SET/@directory}' latest='{$x//RAW_SETS/LATEST/@id}'/>"; + return Optional + .ofNullable(isLookup.quickSearchProfile(xquery)) + .map( + sets -> sets + .stream() + .map(set -> parseSetInfo(set)) + .filter(t -> ids.contains(t.getLeft())) + .map(t -> buildDirectory(basePath, t)) + .collect(Collectors.toList())) + .orElseThrow(() -> new IllegalStateException("empty set list")); + } catch (ActionManagerException | ISLookUpException e) { + throw new IllegalStateException("unable to query ActionSets info from the IS", e); } } - private ActionManagerSet getActionManagerSet(final String basePath, final String profile) - throws ActionManagerException { - final SAXReader reader = new SAXReader(); - final ActionManagerSet set = new ActionManagerSet(); - + private Triple<String, String, String> parseSetInfo(String set) { try { - final Document doc = reader.read(new StringReader(profile)); - - set.setId(doc.valueOf("//SET/@id").trim()); - set.setName(doc.valueOf("//SET").trim()); - set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); - set - .setLatest( - doc.valueOf("//RAW_SETS/LATEST/@id"), - doc.valueOf("//RAW_SETS/LATEST/@creationDate"), - doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); - set.setDirectory(doc.valueOf("//SET/@directory")); - final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); - if (expiredNodes != null) { - for (int i = 0; i < expiredNodes.size(); i++) { - Element ex = (Element) expiredNodes.get(i); - set - .addExpired( - ex.attributeValue("id"), - ex.attributeValue("creationDate"), - ex.attributeValue("lastUpdate")); - } - } - - final StringBuilder sb = new StringBuilder(); - sb.append(basePath); - sb.append("/"); - sb.append(doc.valueOf("//SET/@directory")); - sb.append("/"); - sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); - set.setPathToLatest(sb.toString()); - - return set; - } catch (Exception e) { - throw new ActionManagerException("Error creating set from profile: " + profile, e); + Document doc = new SAXReader().read(new StringReader(set)); + return Triple + .of( + doc.valueOf("//SET/@id"), + doc.valueOf("//SET/@directory"), + doc.valueOf("//SET/@latest")); + } catch (DocumentException e) { + throw new IllegalStateException(e); } } + private String buildDirectory(String basePath, Triple<String, String, String> t) { + return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight()); + } + private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { return queryServiceProperty(isLookup, "basePath"); } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 0052026d4..7893fcf8b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -160,9 +160,9 @@ public class PromoteActionPayloadForGraphTableJob { private static String extractPayload(Row value) { try { - return value.
getAs("payload"); + return value.getAs("payload"); } catch (IllegalArgumentException | ClassCastException e) { - logger.error("cannot extract payload from action: {}", value.toString()); + logger.error("cannot extract payload from action: {}", value); throw e; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java index 0bebe2fb0..4b9fd33f4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java @@ -75,7 +75,6 @@ public class CollectAndSave implements Serializable { .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)) .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)) .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - ; } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 2cd37d9ea..cea8c2891 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -36,7 +36,7 @@ import scala.Tuple2; */ public class SparkAtomicActionScoreJob implements Serializable { - private static String DOI = "doi"; + private static final String DOI = "doi"; private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 979ab4371..5c49791d9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -17,6 +17,7 @@ import org.json4s.jackson.JsonMethods.parse import java.nio.charset.CodingErrorAction import java.text.SimpleDateFormat import java.time.LocalDate +import java.time.chrono.ThaiBuddhistDate import java.time.format.DateTimeFormatter import java.util.{Date, Locale} import java.util.regex.Pattern @@ -164,6 +165,16 @@ object DataciteToOAFTransformation { d } + def fix_thai_date(input:String, format:String) :String = { + try { + val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format)) + val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth) + LocalDate.from(d).toString + } catch { + case _: Throwable => "" + } + } + def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = { if (resourceType != null && resourceType.nonEmpty) { val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) @@ -377,17 +388,31 @@ object 
DataciteToOAFTransformation { .map(d => d.get) if (a_date.isDefined) { - result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) + if(doi.startsWith("10.14457")) + result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null)) + else + result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) } if (i_date.isDefined && i_date.get.isDefined) { - result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + if(doi.startsWith("10.14457")) { + result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null)) + } + else { + result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + } } else if (publication_year != null) { - result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) - } + if(doi.startsWith("10.14457")) { + result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null)) + } else { + result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + } + } result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined) .map(d => (extract_date(d.date.get), d.dateType.get)) @@ -468,11 +493,11 @@ object DataciteToOAFTransformation { JField("awardUri", JString(awardUri)) <- fundingReferences } yield awardUri - val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) - fix_figshare(result) result.setId(IdentifierFactory.createIdentifier(result)) if (result.getId == null) return List() + val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) + fix_figshare(result) if (relations != null && relations.nonEmpty) { List(result) ::: relations } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index d6101ba7a..931ac06f6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -56,6 +56,7 @@ object ImportDatacite { val hdfsTargetPath = new Path(targetPath) log.info(s"hdfsTargetPath is $hdfsTargetPath") + val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt val spkipImport = parser.get("skipImport") log.info(s"skipImport is $spkipImport") @@ -110,7 +111,7 @@ object ImportDatacite { println(s"last Timestamp is $ts") - val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf) + val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs) println(s"Imported from 
Datacite API $cnt documents") @@ -137,7 +138,7 @@ object ImportDatacite { } } - private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = { + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = { var from:Long = timestamp * 1000 val delta:Long = 50000000L var client: DataciteAPIImporter = null @@ -148,7 +149,7 @@ object ImportDatacite { try { var start: Long = System.currentTimeMillis while (from < now) { - client = new DataciteAPIImporter(from, 100, from + delta) + client = new DataciteAPIImporter(from, bs, from + delta) var end: Long = 0 val key: IntWritable = new IntWritable(i) val value: Text = new Text
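The new blocksize parameter (read a few hunks above, defaulting to 100) is threaded into DataciteAPIImporter, which pages the Datacite API over fixed timestamp windows. A rough Java sketch of the loop shape; PageFetcher is a hypothetical stand-in for the importer's HTTP paging, not a real API:

	interface PageFetcher {
		java.util.Iterator<String> fetch(long from, int blockSize, long until);
	}

	// Walk [from, from + delta) windows, pulling blockSize records per API call.
	static long importAll(long lastTimestamp, long now, int blockSize, PageFetcher api) {
		long from = lastTimestamp * 1000; // epoch millis lower bound
		final long delta = 50000000L;     // window width, as in ImportDatacite
		long count = 0;
		while (from < now) {
			java.util.Iterator<String> page = api.fetch(from, blockSize, from + delta); // stand-in for DataciteAPIImporter
			while (page.hasNext()) {
				page.next(); // a real run appends (count, record) to the target SequenceFile here
				count++;
			}
			from += delta;
		}
		return count;
	}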
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index b2d3253d5..9e852eb77 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -143,24 +143,8 @@ public class PrepareProgramme { JavaRDD<CSVProgramme> h2020Programmes = programme .toJavaRDD() - .filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020")) .mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme)) - .reduceByKey((a, b) -> { - if (!a.getLanguage().equals("en")) { - if (b.getLanguage().equalsIgnoreCase("en")) { - a.setTitle(b.getTitle()); - a.setLanguage(b.getLanguage()); - } - } - if (StringUtils.isEmpty(a.getShortTitle())) { - if (!StringUtils.isEmpty(b.getShortTitle())) { - a.setShortTitle(b.getShortTitle()); - } - } - - return a; - - }) + .reduceByKey(PrepareProgramme::groupProgrammeByCode) .map(p -> { CSVProgramme csvProgramme = p._2(); String programmeTitle = csvProgramme.getTitle().trim(); @@ -177,20 +161,31 @@ public class PrepareProgramme { return csvProgramme; }); - // prepareClassification(h2020Programmes); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1); rdd - .map(csvProgramme -> { - String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme); - return tmp; - }) + .map(OBJECT_MAPPER::writeValueAsString) .saveAsTextFile(outputPath); } + private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) { + if (!a.getLanguage().equals("en")) { + if (b.getLanguage().equalsIgnoreCase("en")) { + a.setTitle(b.getTitle()); + a.setLanguage(b.getLanguage()); + } + } + if (StringUtils.isEmpty(a.getShortTitle())) { + if (!StringUtils.isEmpty(b.getShortTitle())) { + a.setShortTitle(b.getShortTitle()); + } + } + + return a; + } + private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) { Object[] codedescription = h2020Programmes .map( @@ -241,15 +236,15 @@ public class PrepareProgramme { if (!ent.contains("Euratom")) { String parent; - String tmp_key = tmp[0] + "."; + String tmpKey = tmp[0] + "."; for (int i = 1; i < tmp.length - 1; i++) { - tmp_key += tmp[i] + "."; - parent = map.get(tmpKey)._1().toLowerCase().trim(); + tmpKey += tmp[i] + "."; + parent = map.get(tmpKey)._1().toLowerCase().trim(); if (parent.contains("|")) { parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); } if (current.trim().length() > parent.length() - && current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) { + && current.toLowerCase().trim().startsWith(parent)) { current = current.substring(parent.length() + 1); if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') { current = current.trim().substring(1).trim(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java index e5cae0ff7..cecd537ba 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java @@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -32,7 +31,6 @@ public class PrepareProjects { private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>(); public static void main(String[] args) throws Exception { @@ -93,7 +91,7 @@ public class PrepareProjects { } private static FlatMapFunction<Tuple2<String, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() { - return (FlatMapFunction<Tuple2<String, CSVProject>, CSVProject>) value -> { + return value -> { Optional<CSVProject> csvProject = Optional.ofNullable(value._2()); List<CSVProject> csvProjectList = new ArrayList<>(); if (csvProject.isPresent()) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index a583b7bfa..fdc12c662 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -120,7 +120,6 @@ public class SparkAtomicActionJob { .map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> { CSVProject csvProject = c._1(); - Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2()); return Optional .ofNullable(c._2()) @@ -135,9 +134,9 @@ H2020Programme pm = new H2020Programme(); H2020Classification h2020classification = new H2020Classification(); pm.setCode(csvProject.getProgramme()); - h2020classification.setClassification(ocsvProgramme.get().getClassification()); + h2020classification.setClassification(csvProgramme.getClassification()); h2020classification.setH2020Programme(pm); - setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short()); + setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short()); // setProgramme(h2020classification, ocsvProgramme.get().getClassification()); pp.setH2020classification(Arrays.asList(h2020classification)); @@ -145,10 +144,11 @@ }) .orElse(null); - }, Encoders.bean(Project.class)); + }, Encoders.bean(Project.class)) + .filter(Objects::nonNull); aaproject - .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code"))) + .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> { Optional<EXCELTopic> op = Optional.ofNullable(p._2()); Project rp = p._1(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java index f991a4297..d486f0104 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java @@ -7,14 +7,7 @@ import java.io.Serializable; * The model for the programme csv file */ public class CSVProgramme implements Serializable { - private String parentProgramme; - private String frameworkProgramme; - private String startDate; - private String endDate; - private String objective; - private String subjects; - private String legalBasis; - private String call; + private String rcn; private String code; @@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable { this.language = language; } - public String getParentProgramme() { - return parentProgramme; - } - - public void setParentProgramme(String parentProgramme) { - this.parentProgramme = parentProgramme; - } - - public String getFrameworkProgramme() { - return frameworkProgramme; - } - - public void setFrameworkProgramme(String frameworkProgramme) { - this.frameworkProgramme = frameworkProgramme; - } - - public String getStartDate() { - return startDate; - } - - public void setStartDate(String startDate) { - this.startDate = startDate; - } - - public String getEndDate() { - return endDate; - } - - public void setEndDate(String endDate) { - this.endDate = endDate; - } - - public String getObjective() { - return objective; - } - - public void setObjective(String objective) { - this.objective = objective; - } - - public String getSubjects() { - return subjects; - } - - public void setSubjects(String subjects) { - this.subjects = subjects; - } - - public String getLegalBasis() { - return legalBasis; - } - - public void setLegalBasis(String legalBasis) { - this.legalBasis = legalBasis; - } - - public String getCall() { - return call; - } - - public void setCall(String call) { - this.call = call; - } +// } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index 0f83499e4..5f5b61d8b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -22,15 +22,18 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook; */ public class EXCELParser { - public List<Object> parse(InputStream file, String classForName) + public List<Object> parse(InputStream file, String classForName, String sheetName) throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, InvalidFormatException { - // OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg); - XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); + XSSFSheet sheet = wb.getSheet(sheetName); + + if (sheet == null) { + throw new RuntimeException("Sheet name " + sheetName + " not present in current file"); + } List<Object> ret = new ArrayList<>();
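EXCELParser maps rows to beans by reflection: the first row supplies the field names, and every later row is written into a fresh instance of classForName. A condensed sketch of that contract (method and variable names are illustrative; the committed class is shown in the hunks around this note):

	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.List;
	import org.apache.commons.lang3.reflect.FieldUtils;
	import org.apache.poi.openxml4j.opc.OPCPackage;
	import org.apache.poi.ss.usermodel.Cell;
	import org.apache.poi.ss.usermodel.DataFormatter;
	import org.apache.poi.ss.usermodel.Row;
	import org.apache.poi.xssf.usermodel.XSSFSheet;
	import org.apache.poi.xssf.usermodel.XSSFWorkbook;

	static List<Object> parseSheet(InputStream in, String className, String sheetName) throws Exception {
		try (XSSFWorkbook wb = new XSSFWorkbook(OPCPackage.open(in))) {
			XSSFSheet sheet = wb.getSheet(sheetName);
			if (sheet == null) {
				throw new IllegalArgumentException("sheet not found: " + sheetName);
			}
			DataFormatter fmt = new DataFormatter();
			List<String> headers = new ArrayList<>();
			List<Object> beans = new ArrayList<>();
			for (Row row : sheet) {
				if (headers.isEmpty()) {
					// header row: each cell's text doubles as a bean field name
					row.forEach(c -> headers.add(fmt.formatCellValue(c)));
				} else {
					Object bean = Class.forName(className).getDeclaredConstructor().newInstance();
					for (int i = 0; i < headers.size(); i++) {
						Cell cell = row.getCell(i); // formatCellValue(null) yields ""
						FieldUtils.writeField(bean, headers.get(i), fmt.formatCellValue(cell), true);
					}
					beans.add(bean);
				}
			}
			return beans;
		}
	}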
@@ -49,12 +52,11 @@ public class EXCELParser { headers.add(dataFormatter.formatCellValue(cell)); } } else { - Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); + Class<?> clazz = Class.forName(classForName); final Object cc = clazz.newInstance(); for (int i = 0; i < headers.size(); i++) { Cell cell = row.getCell(i); - String value = dataFormatter.formatCellValue(cell); FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index cad6b94e1..c73f7ec3d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -28,7 +28,7 @@ public class ReadCSV implements Closeable { private final Configuration conf; private final BufferedWriter writer; private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private String csvFile; + private final String csvFile; public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -85,7 +85,6 @@ public class ReadCSV implements Closeable { this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); this.csvFile = httpConnector.getInputSource(fileURL); - ; } protected void write(final Object p) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index fc3b38ac5..5ce0a681c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -25,7 +25,7 @@ public class ReadExcel implements Closeable { private final Configuration conf; private final BufferedWriter writer; private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private InputStream excelFile; + private final InputStream excelFile; public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -41,19 +41,20 @@ public class ReadExcel implements Closeable { final String hdfsPath = parser.get("hdfsPath"); final String hdfsNameNode = parser.get("hdfsNameNode"); final String classForName = parser.get("classForName"); + final String sheetName = parser.get("sheetName"); try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { log.info("Getting Excel file..."); - readExcel.execute(classForName); + readExcel.execute(classForName, sheetName); } } - public void execute(final String classForName) throws Exception { + public void execute(final String classForName, final String sheetName) throws Exception { EXCELParser excelParser = new EXCELParser(); excelParser - .parse(excelFile, classForName) + .parse(excelFile, classForName, sheetName) .stream() .forEach(p -> write(p)); @@ -82,7 +83,6 @@ public class ReadExcel implements Closeable { this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); this.excelFile = httpConnector.getInputSourceAsStream(fileURL); - ; } protected void write(final Object p) { diff
--git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java new file mode 100644 index 000000000..d6c17e415 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -0,0 +1,215 @@ + +package eu.dnetlib.dhp.actionmanager.ror; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob; +import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType; +import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; +import scala.Tuple2; + +public class GenerateRorActionSetJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final String ROR_NS_PREFIX = "ror_________"; + + private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues( + "10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR"); + + private static final DataInfo ROR_DATA_INFO = dataInfo( + false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92"); + + private static final Qualifier ROR_PID_TYPE = qualifier( + "ROR", "ROR", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES); + + public static void main(final String[] args) throws Exception { + + final String jsonConfiguration = IOUtils + .toString( + SparkAtomicActionJob.class +
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRorOrganizations(spark, inputPath, outputPath); + }); + } + + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + private static void processRorOrganizations(final SparkSession spark, + final String inputPath, + final String outputPath) throws Exception { + + readInputPath(spark, inputPath) + .map( + (MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg, + Encoders.bean(Organization.class)) + .toJavaRDD() + .map(o -> new AtomicAction<>(Organization.class, o)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + } + + protected static Organization convertRorOrg(final RorOrganization r) { + + final Date now = new Date(); + + final Organization o = new Organization(); + + o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId()))); + o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId()))); + o.setCollectedfrom(ROR_COLLECTED_FROM); + o.setPid(pids(r)); + o.setDateofcollection(now.toString()); + o.setDateoftransformation(now.toString()); + o.setExtraInfo(new ArrayList<>()); // Values not present in the file + o.setOaiprovenance(null); // Values not present in the file + o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO)); + o.setLegalname(field(r.getName(), ROR_DATA_INFO)); + o.setAlternativeNames(alternativeNames(r)); + o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO)); + o.setLogourl(null); + o.setEclegalbody(null); + o.setEclegalperson(null); + o.setEcnonprofit(null); + o.setEcresearchorganization(null); + o.setEchighereducation(null); + o.setEcinternationalorganizationeurinterests(null); + o.setEcinternationalorganization(null); + o.setEcenterprise(null); + o.setEcsmevalidated(null); + o.setEcnutscode(null); + if (r.getCountry() != null) { + o + .setCountry( + qualifier( + r.getCountry().getCountryCode(), r + .getCountry() + .getCountryName(), + ModelConstants.DNET_COUNTRY_TYPE, ModelConstants.DNET_COUNTRY_TYPE)); + } else { + o.setCountry(null); + } + o.setDataInfo(ROR_DATA_INFO); + o.setLastupdatetimestamp(now.getTime()); + + return o; + }
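In the ROR dump, each external_ids value carries an "all" member that is either a single string or an array; the ExternalIdTypeDeserializer further below normalizes both shapes before pids() flattens them. A small stand-alone sketch of that normalization, with hypothetical input values:

	import java.util.ArrayList;
	import java.util.List;
	import com.fasterxml.jackson.databind.JsonNode;
	import com.fasterxml.jackson.databind.ObjectMapper;

	// e.g. {"all": "grid.5801.c"} or {"all": ["0000 0001 2156 2780"]} (illustrative values)
	static List<String> readAll(String json) throws Exception {
		JsonNode all = new ObjectMapper().readTree(json).get("all");
		List<String> values = new ArrayList<>();
		if (all.isArray()) {
			all.elements().forEachRemaining(n -> values.add(n.asText()));
		} else {
			values.add(all.asText());
		}
		return values;
	}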
+ + private static List<StructuredProperty> pids(final RorOrganization r) { + final List<StructuredProperty> pids = new ArrayList<>(); + pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO)); + + for (final Map.Entry<String, ExternalIdType> e : r.getExternalIds().entrySet()) { + final String type = e.getKey(); + final List<String> all = e.getValue().getAll(); + if (all != null) { + final Qualifier qualifier = qualifier( + type, type, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES); + for (final String pid : all) { + pids + .add(structuredProperty(pid, qualifier, ROR_DATA_INFO)); + } + } + } + + return pids; + } + + private static List<Field<String>> alternativeNames(final RorOrganization r) { + final Set<String> names = new LinkedHashSet<>(); + names.addAll(r.getAliases()); + names.addAll(r.getAcronyms()); + r.getLabels().forEach(l -> names.add(l.getLabel())); + + return names + .stream() + .filter(StringUtils::isNotBlank) + .map(s -> field(s, ROR_DATA_INFO)) + .collect(Collectors.toList()); + } + + private static Dataset<RorOrganization> readInputPath( + final SparkSession spark, + final String path) throws Exception { + + try (final FileSystem fileSystem = FileSystem.get(new Configuration()); + final InputStream is = fileSystem.open(new Path(path))) { + final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class); + return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)); + } + } + +}
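processRorOrganizations writes (class name, JSON-serialized AtomicAction) Text pairs into a SequenceFile, while readInputPath parses the whole ROR dump on the driver before parallelizing, which is workable at ROR's scale. A hedged sketch of reading the emitted action set back, binding only the payload node (method and variable names assumed):

	import org.apache.hadoop.io.Text;
	import org.apache.spark.api.java.JavaRDD;
	import org.apache.spark.api.java.JavaSparkContext;
	import com.fasterxml.jackson.databind.ObjectMapper;
	import eu.dnetlib.dhp.schema.oaf.Organization;

	static JavaRDD<Organization> readOrganizations(JavaSparkContext sc, String path) {
		final ObjectMapper mapper = new ObjectMapper();
		return sc
			.sequenceFile(path, Text.class, Text.class)
			// each value looks like {"clazz": "...Organization", "payload": {...}}; bind just the payload
			.map(t -> mapper.readValue(mapper.readTree(t._2().toString()).get("payload").toString(), Organization.class));
	}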
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java new file mode 100644 index 000000000..b566a5501 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java @@ -0,0 +1,122 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Address implements Serializable { + + @JsonProperty("lat") + private Float lat; + + @JsonProperty("state_code") + private String stateCode; + + @JsonProperty("country_geonames_id") + private Integer countryGeonamesId; + + @JsonProperty("lng") + private Float lng; + + @JsonProperty("state") + private String state; + + @JsonProperty("city") + private String city; + + @JsonProperty("geonames_city") + private GeonamesCity geonamesCity; + + @JsonProperty("postcode") + private String postcode; + + @JsonProperty("primary") + private Boolean primary; + + @JsonProperty("line") + private String line; + + private final static long serialVersionUID = 2444635485253443195L; + + public Float getLat() { + return lat; + } + + public void setLat(final Float lat) { + this.lat = lat; + } + + public String getStateCode() { + return stateCode; + } + + public void setStateCode(final String stateCode) { + this.stateCode = stateCode; + } + + public Integer getCountryGeonamesId() { + return countryGeonamesId; + } + + public void setCountryGeonamesId(final Integer countryGeonamesId) { + this.countryGeonamesId = countryGeonamesId; + } + + public Float getLng() { + return lng; + } + + public void setLng(final Float lng) { + this.lng = lng; + } + + public String getState() { + return state; + } + + public void setState(final String state) { + this.state = state; + } + + public String getCity() { + return city; + } + + public void setCity(final String city) { + this.city = city; + } + + public GeonamesCity getGeonamesCity() { + return geonamesCity; + } + + public void setGeonamesCity(final GeonamesCity geonamesCity) { + this.geonamesCity = geonamesCity; + } + + public String getPostcode() { + return postcode; + } + + public void setPostcode(final String postcode) { + this.postcode = postcode; + } + + public Boolean getPrimary() { + return primary; + } + + public void setPrimary(final Boolean primary) { + this.primary = primary; + } + + public String getLine() { + return line; + } + + public void setLine(final String line) { + this.line = line; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java new file mode 100644 index 000000000..3dab60a9f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Country implements Serializable { + + @JsonProperty("country_code") + private String countryCode; + + @JsonProperty("country_name") + private String countryName; + + private final static long serialVersionUID = 4357848706229493627L; + + public String getCountryCode() { + return countryCode; + } + + public void setCountryCode(final String countryCode) { + this.countryCode = countryCode; + } + + public String getCountryName() { + return countryName; + } + + public void setCountryName(final String countryName) { + this.countryName = countryName; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java new file mode 100644 index 000000000..406bfd82c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; +import java.util.List; + +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; + +@JsonDeserialize(using = ExternalIdTypeDeserializer.class) +public class ExternalIdType implements Serializable { + + private List<String> all; + + private String preferred; + + private final static long serialVersionUID = 2616688352998387611L; + + public ExternalIdType() { + } + + public ExternalIdType(final List<String> all, final String preferred) { + this.all = all; + this.preferred = preferred; + } + + public List<String> getAll() { + return all; + } + + public void setAll(final List<String> all) { + this.all = all; + } + + public String getPreferred() { + return preferred; + } + + public void setPreferred(final String preferred) { + this.preferred = preferred; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java new file mode 100644 index 000000000..3fd0c9250 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java @@ -0,0 +1,38 @@ +
package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.ObjectCodec; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonNode; + +public class ExternalIdTypeDeserializer extends JsonDeserializer<ExternalIdType> { + + @Override + public ExternalIdType deserialize(final JsonParser p, final DeserializationContext ctxt) + throws IOException, JsonProcessingException { + final ObjectCodec oc =
p.getCodec(); + final JsonNode node = oc.readTree(p); + + final JsonNode allNode = node.get("all"); + + final String preferred = node.get("preferred").asText(); + + final List all = new ArrayList<>(); + + if (allNode.isArray()) { + allNode.elements().forEachRemaining(x -> all.add(x.asText())); + } else { + all.add(allNode.asText()); + } + + return new ExternalIdType(all, preferred); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java new file mode 100644 index 000000000..9616db447 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class GeonamesAdmin implements Serializable { + + @JsonProperty("ascii_name") + private String asciiName; + + @JsonProperty("id") + private Integer id; + + @JsonProperty("name") + private String name; + + @JsonProperty("code") + private String code; + + private final static long serialVersionUID = 7294958526269195673L; + + public String getAsciiName() { + return asciiName; + } + + public void setAsciiName(final String asciiName) { + this.asciiName = asciiName; + } + + public Integer getId() { + return id; + } + + public void setId(final Integer id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getCode() { + return code; + } + + public void setCode(final String code) { + this.code = code; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java new file mode 100644 index 000000000..2b0487168 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java @@ -0,0 +1,100 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class GeonamesCity implements Serializable { + + @JsonProperty("geonames_admin1") + private GeonamesAdmin geonamesAdmin1; + + @JsonProperty("geonames_admin2") + private GeonamesAdmin geonamesAdmin2; + + @JsonProperty("city") + private String city; + + @JsonProperty("id") + private Integer id; + + @JsonProperty("nuts_level1") + private NameAndCode nutsLevel1; + + @JsonProperty("nuts_level2") + private NameAndCode nutsLevel2; + + @JsonProperty("nuts_level3") + private NameAndCode nutsLevel3; + + @JsonProperty("license") + private License license; + + private final static long serialVersionUID = -8389480201526252955L; + + public NameAndCode getNutsLevel2() { + return nutsLevel2; + } + + public void setNutsLevel2(final NameAndCode nutsLevel2) { + this.nutsLevel2 = nutsLevel2; + } + + public GeonamesAdmin getGeonamesAdmin2() { + return geonamesAdmin2; + } + + public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) { + this.geonamesAdmin2 = geonamesAdmin2; + } + + public GeonamesAdmin getGeonamesAdmin1() { + return geonamesAdmin1; + } + + public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) { + this.geonamesAdmin1 = geonamesAdmin1; + } + + public String getCity() { + return city; + } + + public 
void setCity(final String city) { + this.city = city; + } + + public Integer getId() { + return id; + } + + public void setId(final Integer id) { + this.id = id; + } + + public NameAndCode getNutsLevel1() { + return nutsLevel1; + } + + public void setNutsLevel1(final NameAndCode nutsLevel1) { + this.nutsLevel1 = nutsLevel1; + } + + public NameAndCode getNutsLevel3() { + return nutsLevel3; + } + + public void setNutsLevel3(final NameAndCode nutsLevel3) { + this.nutsLevel3 = nutsLevel3; + } + + public License getLicense() { + return license; + } + + public void setLicense(final License license) { + this.license = license; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java new file mode 100644 index 000000000..61eb0339d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Label implements Serializable { + + @JsonProperty("iso639") + private String iso639; + + @JsonProperty("label") + private String label; + + private final static long serialVersionUID = -6576156103297850809L; + + public String getIso639() { + return iso639; + } + + public void setIso639(final String iso639) { + this.iso639 = iso639; + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java new file mode 100644 index 000000000..bdc8f4c42 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class License implements Serializable { + + @JsonProperty("attribution") + private String attribution; + + @JsonProperty("license") + private String license; + + private final static long serialVersionUID = -194308261058176439L; + + public String getAttribution() { + return attribution; + } + + public void setAttribution(final String attribution) { + this.attribution = attribution; + } + + public String getLicense() { + return license; + } + + public void setLicense(final String license) { + this.license = license; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java new file mode 100644 index 000000000..61d7eb8e6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class NameAndCode implements Serializable { + + @JsonProperty("name") + private String name; + + @JsonProperty("code") + private String code; + + private final static long serialVersionUID = 5459836979206140843L; + + public String getName() { + return name; + } + + public void setName(final 
String name) { + this.name = name; + } + + public String getCode() { + return code; + } + + public void setCode(final String code) { + this.code = code; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java new file mode 100644 index 000000000..8b73db98f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Relationship implements Serializable { + + @JsonProperty("type") + private String type; + + @JsonProperty("id") + private String id; + + @JsonProperty("label") + private String label; + + private final static long serialVersionUID = 7847399503395576960L; + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java new file mode 100644 index 000000000..94de34fee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java @@ -0,0 +1,192 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class RorOrganization implements Serializable { + + @JsonProperty("ip_addresses") + private List<String> ipAddresses = new ArrayList<>(); + + @JsonProperty("aliases") + private List<String> aliases = new ArrayList<>(); + + @JsonProperty("acronyms") + private List<String> acronyms = new ArrayList<>(); + + @JsonProperty("links") + private List<String> links = new ArrayList<>(); + + @JsonProperty("country") + private Country country; + + @JsonProperty("name") + private String name; + + @JsonProperty("wikipedia_url") + private String wikipediaUrl; + + @JsonProperty("addresses") + private List<Address> addresses = new ArrayList<>(); + + @JsonProperty("types") + private List<String> types = new ArrayList<>(); + + @JsonProperty("established") + private Integer established; + + @JsonProperty("relationships") + private List<Relationship> relationships = new ArrayList<>(); + + @JsonProperty("email_address") + private String emailAddress; + + @JsonProperty("external_ids") + private Map<String, ExternalIdType> externalIds = new LinkedHashMap<>(); + + @JsonProperty("id") + private String id; + + @JsonProperty("labels") + private List<Label>