diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index acac3594f..74f31cf35 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -21,6 +21,10 @@
org.apache.hadoop
hadoop-common
+
+ com.github.sisyphsu
+ dateparser
+
org.apache.spark
spark-core_2.11
diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java
deleted file mode 100644
index 59fe941ed..000000000
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java
+++ /dev/null
@@ -1,182 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-import java.util.UUID;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstores")
-public class MDStore implements Serializable {
-
- /** */
- private static final long serialVersionUID = 3160530489149700055L;
-
- @Id
- @Column(name = "id")
- private String id;
-
- @Column(name = "format")
- private String format;
-
- @Column(name = "layout")
- private String layout;
-
- @Column(name = "interpretation")
- private String interpretation;
-
- @Column(name = "datasource_name")
- private String datasourceName;
-
- @Column(name = "datasource_id")
- private String datasourceId;
-
- @Column(name = "api_id")
- private String apiId;
-
- @Column(name = "hdfs_path")
- private String hdfsPath;
-
- @Column(name = "creation_date")
- @Temporal(TemporalType.TIMESTAMP)
- private Date creationDate;
-
- public String getId() {
- return id;
- }
-
- public void setId(final String id) {
- this.id = id;
- }
-
- public String getFormat() {
- return format;
- }
-
- public void setFormat(final String format) {
- this.format = format;
- }
-
- public String getLayout() {
- return layout;
- }
-
- public void setLayout(final String layout) {
- this.layout = layout;
- }
-
- public String getInterpretation() {
- return interpretation;
- }
-
- public void setInterpretation(final String interpretation) {
- this.interpretation = interpretation;
- }
-
- public String getDatasourceName() {
- return datasourceName;
- }
-
- public void setDatasourceName(final String datasourceName) {
- this.datasourceName = datasourceName;
- }
-
- public String getDatasourceId() {
- return datasourceId;
- }
-
- public void setDatasourceId(final String datasourceId) {
- this.datasourceId = datasourceId;
- }
-
- public String getApiId() {
- return apiId;
- }
-
- public void setApiId(final String apiId) {
- this.apiId = apiId;
- }
-
- public String getHdfsPath() {
- return hdfsPath;
- }
-
- public void setHdfsPath(final String hdfsPath) {
- this.hdfsPath = hdfsPath;
- }
-
- public Date getCreationDate() {
- return creationDate;
- }
-
- public void setCreationDate(final Date creationDate) {
- this.creationDate = creationDate;
- }
-
- public static MDStore newInstance(
- final String format,
- final String layout,
- final String interpretation,
- final String hdfsBasePath) {
- return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
- }
-
- public static MDStore newInstance(
- final String format,
- final String layout,
- final String interpretation,
- final String dsName,
- final String dsId,
- final String apiId,
- final String hdfsBasePath) {
-
- final String mdId = "md-" + UUID.randomUUID();
-
- final MDStore md = new MDStore();
- md.setId(mdId);
- md.setFormat(format);
- md.setLayout(layout);
- md.setInterpretation(interpretation);
- md.setCreationDate(new Date());
- md.setDatasourceName(dsName);
- md.setDatasourceId(dsId);
- md.setApiId(apiId);
- md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
-
- return md;
- }
-
- @Override
- public String toString() {
- return String
- .format(
- "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
- id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(id);
- }
-
- @Override
- public boolean equals(final Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof MDStore)) {
- return false;
- }
- final MDStore other = (MDStore) obj;
- return Objects.equals(id, other.id);
- }
-
-}
diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java
deleted file mode 100644
index d808e2de7..000000000
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java
+++ /dev/null
@@ -1,74 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-
-@Entity
-@Table(name = "mdstore_current_versions")
-public class MDStoreCurrentVersion implements Serializable {
-
- /** */
- private static final long serialVersionUID = -4757725888593745773L;
-
- @Id
- @Column(name = "mdstore")
- private String mdstore;
-
- @Column(name = "current_version")
- private String currentVersion;
-
- public String getMdstore() {
- return mdstore;
- }
-
- public void setMdstore(final String mdstore) {
- this.mdstore = mdstore;
- }
-
- public String getCurrentVersion() {
- return currentVersion;
- }
-
- public void setCurrentVersion(final String currentVersion) {
- this.currentVersion = currentVersion;
- }
-
- public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) {
- final MDStoreCurrentVersion cv = new MDStoreCurrentVersion();
- cv.setMdstore(mdId);
- cv.setCurrentVersion(versionId);
- return cv;
- }
-
- public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
- return newInstance(v.getMdstore(), v.getId());
- }
-
- @Override
- public String toString() {
- return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(currentVersion, mdstore);
- }
-
- @Override
- public boolean equals(final Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof MDStoreCurrentVersion)) {
- return false;
- }
- final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
- return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
- }
-}
diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java
deleted file mode 100644
index 38f8f275e..000000000
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java
+++ /dev/null
@@ -1,140 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstore_versions")
-public class MDStoreVersion implements Serializable {
-
- /** */
- private static final long serialVersionUID = -4763494442274298339L;
-
- @Id
- @Column(name = "id")
- private String id;
-
- @Column(name = "mdstore")
- private String mdstore;
-
- @Column(name = "writing")
- private boolean writing;
-
- @Column(name = "readcount")
- private int readCount = 0;
-
- @Column(name = "lastupdate")
- @Temporal(TemporalType.TIMESTAMP)
- private Date lastUpdate;
-
- @Column(name = "size")
- private long size = 0;
-
- @Column(name = "hdfs_path")
- private String hdfsPath;
-
- public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
- final MDStoreVersion v = new MDStoreVersion();
-
- final String versionId = mdId + "-" + new Date().getTime();
- v.setId(versionId);
- v.setMdstore(mdId);
- v.setLastUpdate(null);
- v.setWriting(writing);
- v.setReadCount(0);
- v.setSize(0);
- v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId));
-
- return v;
- }
-
- public String getId() {
- return id;
- }
-
- public void setId(final String id) {
- this.id = id;
- }
-
- public String getMdstore() {
- return mdstore;
- }
-
- public void setMdstore(final String mdstore) {
- this.mdstore = mdstore;
- }
-
- public boolean isWriting() {
- return writing;
- }
-
- public void setWriting(final boolean writing) {
- this.writing = writing;
- }
-
- public int getReadCount() {
- return readCount;
- }
-
- public void setReadCount(final int readCount) {
- this.readCount = readCount;
- }
-
- public Date getLastUpdate() {
- return lastUpdate;
- }
-
- public void setLastUpdate(final Date lastUpdate) {
- this.lastUpdate = lastUpdate;
- }
-
- public long getSize() {
- return size;
- }
-
- public void setSize(final long size) {
- this.size = size;
- }
-
- public String getHdfsPath() {
- return hdfsPath;
- }
-
- public void setHdfsPath(final String hdfsPath) {
- this.hdfsPath = hdfsPath;
- }
-
- @Override
- public String toString() {
- return String
- .format(
- "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
- mdstore, writing, readCount, lastUpdate, size, hdfsPath);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(id);
- }
-
- @Override
- public boolean equals(final Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof MDStoreVersion)) {
- return false;
- }
- final MDStoreVersion other = (MDStoreVersion) obj;
- return Objects.equals(id, other.id);
- }
-}
diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java
deleted file mode 100644
index 510c65092..000000000
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java
+++ /dev/null
@@ -1,194 +0,0 @@
-
-package eu.dnetlib.data.mdstore.manager.common.model;
-
-import java.io.Serializable;
-import java.util.Date;
-import java.util.Objects;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-import javax.persistence.Temporal;
-import javax.persistence.TemporalType;
-
-@Entity
-@Table(name = "mdstores_with_info")
-public class MDStoreWithInfo implements Serializable {
-
- /** */
- private static final long serialVersionUID = -8445784770687571492L;
-
- @Id
- @Column(name = "id")
- private String id;
-
- @Column(name = "format")
- private String format;
-
- @Column(name = "layout")
- private String layout;
-
- @Column(name = "interpretation")
- private String interpretation;
-
- @Column(name = "datasource_name")
- private String datasourceName;
-
- @Column(name = "datasource_id")
- private String datasourceId;
-
- @Column(name = "api_id")
- private String apiId;
-
- @Column(name = "current_version")
- private String currentVersion;
-
- @Column(name = "creation_date")
- @Temporal(TemporalType.TIMESTAMP)
- private Date creationDate;
-
- @Column(name = "lastupdate")
- @Temporal(TemporalType.TIMESTAMP)
- private Date lastUpdate;
-
- @Column(name = "size")
- private long size = 0;
-
- @Column(name = "n_versions")
- private long numberOfVersions = 0;
-
- @Column(name = "hdfs_path")
- private String hdfsPath;
-
- public String getId() {
- return id;
- }
-
- public void setId(final String id) {
- this.id = id;
- }
-
- public String getFormat() {
- return format;
- }
-
- public void setFormat(final String format) {
- this.format = format;
- }
-
- public String getLayout() {
- return layout;
- }
-
- public void setLayout(final String layout) {
- this.layout = layout;
- }
-
- public String getInterpretation() {
- return interpretation;
- }
-
- public void setInterpretation(final String interpretation) {
- this.interpretation = interpretation;
- }
-
- public String getDatasourceName() {
- return datasourceName;
- }
-
- public void setDatasourceName(final String datasourceName) {
- this.datasourceName = datasourceName;
- }
-
- public String getDatasourceId() {
- return datasourceId;
- }
-
- public void setDatasourceId(final String datasourceId) {
- this.datasourceId = datasourceId;
- }
-
- public String getApiId() {
- return apiId;
- }
-
- public void setApiId(final String apiId) {
- this.apiId = apiId;
- }
-
- public String getCurrentVersion() {
- return currentVersion;
- }
-
- public void setCurrentVersion(final String currentVersion) {
- this.currentVersion = currentVersion;
- }
-
- public Date getCreationDate() {
- return creationDate;
- }
-
- public void setCreationDate(final Date creationDate) {
- this.creationDate = creationDate;
- }
-
- public Date getLastUpdate() {
- return lastUpdate;
- }
-
- public void setLastUpdate(final Date lastUpdate) {
- this.lastUpdate = lastUpdate;
- }
-
- public long getSize() {
- return size;
- }
-
- public void setSize(final long size) {
- this.size = size;
- }
-
- public long getNumberOfVersions() {
- return numberOfVersions;
- }
-
- public void setNumberOfVersions(final long numberOfVersions) {
- this.numberOfVersions = numberOfVersions;
- }
-
- public String getHdfsPath() {
- return hdfsPath;
- }
-
- public void setHdfsPath(final String hdfsPath) {
- this.hdfsPath = hdfsPath;
- }
-
- @Override
- public String toString() {
- return String
- .format(
- "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
- id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
- lastUpdate, size, numberOfVersions, hdfsPath);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(id);
- }
-
- @Override
- public boolean equals(final Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof MDStoreWithInfo)) {
- return false;
- }
- final MDStoreWithInfo other = (MDStoreWithInfo) obj;
- return Objects.equals(id, other.id);
- }
-
-}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
index cedc9bd4d..fabb25f16 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
- private Connection connection;
+ private final Connection connection;
public DbClient(final String address, final String login, final String password) {
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
index 76017d5b7..7dc0e4417 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
@@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable {
BufferedInputStream bis = new BufferedInputStream(is);
int count;
- byte data[] = new byte[1024];
+ byte[] data = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java
index c3f393436..c127783e5 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java
@@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
- private InputStream inputStream;
- private MediaType mediaType;
- private long lenght;
+ private final InputStream inputStream;
+ private final MediaType mediaType;
+ private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java
index 853d22bc2..98dabf56a 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java
@@ -21,7 +21,7 @@ public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
- private static ObjectMapper mapper = new ObjectMapper();
+ private static final ObjectMapper mapper = new ObjectMapper();
public static T doGET(final String url, Class clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java
index 0c6eacf99..deeda9beb 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java
@@ -34,7 +34,7 @@ public class MessageSender {
private final String workflowId;
- private ExecutorService executorService = Executors.newCachedThreadPool();
+ private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
new file mode 100644
index 000000000..a75cc52e6
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@@ -0,0 +1,459 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jetbrains.annotations.NotNull;
+
+import com.github.sisyphsu.dateparser.DateParserUtils;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class GraphCleaningFunctions extends CleaningFunctions {
+ /*
+  * Constants used by the cleaning routines of this class.
+  * ORCID_CLEANING_REGEX: captures four groups of four characters separated by dash-like characters or '=';
+  *   cleanup() uses it to rewrite a trimmed, lower-cased ORCID value as "$1-$2-$3-$4".
+  * ORCID_LEN: length the rewritten ORCID must have (4 x 4 characters plus 3 dashes); otherwise cleanup()
+  *   blanks the value.
+  * CLEANING_REGEX: newline, carriage return and tab; cleanValue() replaces every match with a single space.
+  * INVALID_AUTHOR_REGEX: pattern tested by isValidAuthorName() against the lower-cased, concatenated names.
+  * TITLE_FILTER_REGEX: characters removed from a lower-cased title before cleanup() measures what is left.
+  *   NOTE(review): the square brackets make this a character class (the characters '.', '*', 't', 'e', 's',
+  *   non-word characters and digits), not a match on the word "test" - confirm this is intended.
+  * TITLE_FILTER_RESIDUAL_LENGTH: a title is kept only when more than this many characters remain after
+  *   that removal.
+  */
+ public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
+ public static final int ORCID_LEN = 19;
+ public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
+ public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
+ public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
+ public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
+
+ /**
+  * Fills in the vocabulary (scheme) information of the qualifiers carried by the given object, wherever it
+  * is blank. Only organizations and results are touched; any other object is returned untouched.
+  *
+  * @param value the object to fix; it is modified in place
+  * @return the very same instance received as input
+  */
+ public static T fixVocabularyNames(T value) {
+     if (value instanceof Datasource || value instanceof Project) {
+         // nothing to clean here
+         return value;
+     }
+     if (value instanceof Organization) {
+         final Organization org = (Organization) value;
+         if (Objects.nonNull(org.getCountry())) {
+             fixVocabName(org.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
+         }
+         return value;
+     }
+     if (value instanceof Relation) {
+         // nothing to clean here
+         return value;
+     }
+     if (!(value instanceof Result)) {
+         return value;
+     }
+
+     final Result result = (Result) value;
+
+     fixVocabName(result.getLanguage(), ModelConstants.DNET_LANGUAGES);
+     fixVocabName(result.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
+     fixVocabName(result.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
+
+     if (Objects.nonNull(result.getSubject())) {
+         result
+             .getSubject()
+             .forEach(subject -> fixVocabName(subject.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
+     }
+     if (Objects.nonNull(result.getInstance())) {
+         for (Instance instance : result.getInstance()) {
+             fixVocabName(instance.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
+             fixVocabName(instance.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
+         }
+     }
+     if (Objects.nonNull(result.getAuthor())) {
+         for (Author author : result.getAuthor()) {
+             // null authors and authors without a pid list carry nothing to fix
+             if (Objects.isNull(author) || Objects.isNull(author.getPid())) {
+                 continue;
+             }
+             author
+                 .getPid()
+                 .stream()
+                 .filter(Objects::nonNull)
+                 .forEach(pid -> fixVocabName(pid.getQualifier(), ModelConstants.DNET_PID_TYPES));
+         }
+     }
+
+     return value;
+ }
+
+ /**
+  * Tells whether the given object should be retained.
+  * A result whose title list is present but empty is rejected; every other object is accepted.
+  *
+  * @param value the object to evaluate
+  * @return false for a result with a non-null, empty title list; true in every other case
+  */
+ public static boolean filter(T value) {
+     // these types are always accepted: nothing to evaluate on them
+     final boolean nothingToEvaluate = value instanceof Datasource
+         || value instanceof Project
+         || value instanceof Organization
+         || value instanceof Relation;
+     if (nothingToEvaluate || !(value instanceof Result)) {
+         return true;
+     }
+
+     final Result result = (Result) value;
+     return Objects.isNull(result.getTitle()) || !result.getTitle().isEmpty();
+ }
+ /**
+  * Normalises the given object in place and returns it.
+  * Datasources and projects are left as they are.
+  * Organizations get ModelConstants.UNKNOWN_COUNTRY when the country is missing or has a blank classid.
+  * Relations get their validation date cleaned through doCleanDate(); the validated flag is set to true when
+  * a date survives the cleaning, otherwise the date is nulled and the flag is set to false.
+  * Results go through these steps, in order: date of acceptance and relevant dates are cleaned (entries
+  * that cannot be cleaned are dropped); a blank publisher is removed; language and resource type receive a
+  * default when missing; subjects, titles and descriptions are filtered and passed to cleanValue(); pids
+  * are cleaned with processPidCleaning(); every instance gets cleaned pids, alternate identifiers without
+  * the values already among its pids, default access right / hostedby / refereed, and a cleaned date of
+  * acceptance; the best access right is computed when missing; authors without a usable full name are
+  * dropped, ranks are reassigned when at least one is null, and author pids are normalised and
+  * de-duplicated.
+  *
+  * @param value the object to clean; it is modified in place
+  * @return the very same instance received as input
+  */
+ public static T cleanup(T value) {
+ if (value instanceof Datasource) {
+ // nothing to clean here
+ } else if (value instanceof Project) {
+ // nothing to clean here
+ } else if (value instanceof Organization) {
+ Organization o = (Organization) value;
+ if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
+ o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
+ }
+ } else if (value instanceof Relation) {
+ Relation r = (Relation) value;
+
+ Optional validationDate = doCleanDate(r.getValidationDate());
+ if (validationDate.isPresent()) {
+ r.setValidationDate(validationDate.get());
+ r.setValidated(true);
+ } else {
+ r.setValidationDate(null);
+ r.setValidated(false);
+ }
+ } else if (value instanceof Result) {
+
+ Result r = (Result) value;
+
+ if (Objects.nonNull(r.getDateofacceptance())) {
+ Optional date = cleanDateField(r.getDateofacceptance());
+ if (date.isPresent()) {
+ r.getDateofacceptance().setValue(date.get());
+ } else {
+ r.setDateofacceptance(null);
+ }
+ }
+ if (Objects.nonNull(r.getRelevantdate())) {
+ r
+ .setRelevantdate(
+ r
+ .getRelevantdate()
+ .stream()
+ .filter(Objects::nonNull)
+ .filter(sp -> Objects.nonNull(sp.getQualifier()))
+ .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+ .map(sp -> {
+ sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
+ return sp;
+ })
+ .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+ .collect(Collectors.toList()));
+ }
+ if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
+ r.setPublisher(null);
+ }
+ if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
+ r
+ .setLanguage(
+ qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
+ }
+ if (Objects.nonNull(r.getSubject())) {
+ r
+ .setSubject(
+ r
+ .getSubject()
+ .stream()
+ .filter(Objects::nonNull)
+ .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+ .filter(sp -> Objects.nonNull(sp.getQualifier()))
+ .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+ .map(GraphCleaningFunctions::cleanValue)
+ .collect(Collectors.toList()));
+ }
+ if (Objects.nonNull(r.getTitle())) {
+ r
+ .setTitle(
+ r
+ .getTitle()
+ .stream()
+ .filter(Objects::nonNull)
+ .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+ .filter( // keep a title only when enough characters survive the TITLE_FILTER_REGEX removal
+ sp -> sp
+ .getValue()
+ .toLowerCase()
+ .replaceAll(TITLE_FILTER_REGEX, "")
+ .length() > TITLE_FILTER_RESIDUAL_LENGTH)
+ .map(GraphCleaningFunctions::cleanValue)
+ .collect(Collectors.toList()));
+ }
+ if (Objects.nonNull(r.getDescription())) {
+ r
+ .setDescription(
+ r
+ .getDescription()
+ .stream()
+ .filter(Objects::nonNull)
+ .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+ .map(GraphCleaningFunctions::cleanValue)
+ .collect(Collectors.toList()));
+ }
+ if (Objects.nonNull(r.getPid())) {
+ r.setPid(processPidCleaning(r.getPid()));
+ }
+ if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
+ r
+ .setResourcetype(
+ qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
+ }
+ if (Objects.nonNull(r.getInstance())) {
+
+ for (Instance i : r.getInstance()) {
+ if (Objects.nonNull(i.getPid())) {
+ i.setPid(processPidCleaning(i.getPid()));
+ }
+ if (Objects.nonNull(i.getAlternateIdentifier())) {
+ i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
+ }
+ Optional // remove from the alternate identifiers the entries already present among the pids
+ .ofNullable(i.getPid())
+ .ifPresent(pid -> {
+ final Set pids = Sets.newHashSet(pid);
+ Optional
+ .ofNullable(i.getAlternateIdentifier())
+ .ifPresent(altId -> {
+ final Set altIds = Sets.newHashSet(altId);
+ i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
+ });
+ });
+
+ if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
+ i
+ .setAccessright(
+ accessRight(
+ ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+ ModelConstants.DNET_ACCESS_MODES));
+ }
+ if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
+ i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
+ }
+ if (Objects.isNull(i.getRefereed())) {
+ i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
+ }
+ if (Objects.nonNull(i.getDateofacceptance())) {
+ Optional date = cleanDateField(i.getDateofacceptance());
+ if (date.isPresent()) {
+ i.getDateofacceptance().setValue(date.get());
+ } else {
+ i.setDateofacceptance(null);
+ }
+ }
+ }
+ }
+ if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
+ Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
+ if (Objects.isNull(bestaccessrights)) {
+ r
+ .setBestaccessright(
+ qualifier(
+ ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+ ModelConstants.DNET_ACCESS_MODES));
+ } else {
+ r.setBestaccessright(bestaccessrights);
+ }
+ }
+ if (Objects.nonNull(r.getAuthor())) {
+ r
+ .setAuthor(
+ r
+ .getAuthor()
+ .stream()
+ .filter(a -> Objects.nonNull(a))
+ .filter(a -> StringUtils.isNotBlank(a.getFullname()))
+ .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
+ .collect(Collectors.toList()));
+
+ boolean nullRank = r
+ .getAuthor()
+ .stream()
+ .anyMatch(a -> Objects.isNull(a.getRank()));
+ if (nullRank) { // at least one author has no rank: renumber all of them by list position, from 1
+ int i = 1;
+ for (Author author : r.getAuthor()) {
+ author.setRank(i++);
+ }
+ }
+
+ for (Author a : r.getAuthor()) {
+ if (Objects.isNull(a.getPid())) {
+ a.setPid(Lists.newArrayList());
+ } else {
+ a
+ .setPid(
+ a
+ .getPid()
+ .stream()
+ .filter(Objects::nonNull)
+ .filter(p -> Objects.nonNull(p.getQualifier()))
+ .filter(p -> StringUtils.isNotBlank(p.getValue()))
+ .map(p -> {
+ // hack to distinguish orcid from orcid_pending
+ String pidProvenance = Optional
+ .ofNullable(p.getDataInfo())
+ .map(
+ d -> Optional
+ .ofNullable(d.getProvenanceaction())
+ .map(Qualifier::getClassid)
+ .orElse(""))
+ .orElse("");
+ if (p
+ .getQualifier()
+ .getClassid()
+ .toLowerCase()
+ .contains(ModelConstants.ORCID)) {
+ if (pidProvenance
+ .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
+ p.getQualifier().setClassid(ModelConstants.ORCID);
+ } else {
+ p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
+ }
+ final String orcid = p
+ .getValue()
+ .trim()
+ .toLowerCase()
+ .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
+ if (orcid.length() == ORCID_LEN) {
+ p.setValue(orcid);
+ } else {
+ p.setValue("");
+ }
+ }
+ return p;
+ })
+ .filter(p -> StringUtils.isNotBlank(p.getValue()))
+ .collect( // de-duplicate on classid + value, keeping the first occurrence and the encounter order
+ Collectors
+ .toMap(
+ p -> p.getQualifier().getClassid() + p.getValue(),
+ Function.identity(),
+ (p1, p2) -> p1,
+ LinkedHashMap::new))
+ .values()
+ .stream()
+ .collect(Collectors.toList()));
+ }
+ }
+ }
+ if (value instanceof Publication) { // empty placeholders: no subtype-specific cleaning is performed
+
+ } else if (value instanceof Dataset) {
+
+ } else if (value instanceof OtherResearchProduct) {
+
+ } else if (value instanceof Software) {
+
+ }
+ }
+
+ return value;
+ }
+
+ /**
+  * Reads the value of a date field and normalises it with {@link #cleanDate(String)}.
+  *
+  * @param dateofacceptance the field holding the date, possibly null
+  * @return the cleaned date; empty when the field is null, its value is null, or the value cannot be cleaned
+  */
+ private static Optional cleanDateField(Field dateofacceptance) {
+     if (Objects.isNull(dateofacceptance)) {
+         return Optional.empty();
+     }
+     // Optional.map already yields an empty Optional when cleanDate returns null
+     return Optional
+         .ofNullable(dateofacceptance.getValue())
+         .map(GraphCleaningFunctions::cleanDate);
+ }
+
+ /**
+  * Optional-returning variant of {@link #cleanDate(String)}.
+  *
+  * @param date the raw date string, possibly null
+  * @return the cleaned date, or an empty Optional when cleanDate returns null
+  */
+ protected static Optional doCleanDate(String date) {
+     final String cleaned = cleanDate(date);
+     return Objects.isNull(cleaned) ? Optional.empty() : Optional.of(cleaned);
+ }
+
+ /**
+  * Parses a date string with DateParserUtils and re-formats it with the ModelSupport.DATE_FORMAT pattern.
+  * The parsed instant is converted to a local date in the JVM default time zone.
+  *
+  * @param inputDate the raw date string, possibly null or blank
+  * @return the re-formatted date, or null when the input is blank or a DateTimeParseException is raised
+  */
+ public static String cleanDate(final String inputDate) {
+     if (StringUtils.isNotBlank(inputDate)) {
+         try {
+             final String trimmed = inputDate.trim();
+             final LocalDate parsed = DateParserUtils
+                 .parseDate(trimmed)
+                 .toInstant()
+                 .atZone(ZoneId.systemDefault())
+                 .toLocalDate();
+             final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT);
+             return parsed.format(formatter);
+         } catch (DateTimeParseException e) {
+             // unparseable input is reported to the caller as null
+             return null;
+         }
+     }
+     return null;
+ }
+
+ // HELPERS
+
+ /**
+  * Checks the author names against INVALID_AUTHOR_REGEX.
+  * The non-empty values among full name, name and surname are concatenated (in that order, no separator),
+  * lower-cased and matched as a whole against the pattern.
+  *
+  * @param a the author to check
+  * @return false when the concatenated names match INVALID_AUTHOR_REGEX, true otherwise
+  */
+ private static boolean isValidAuthorName(Author a) {
+     final StringBuilder joined = new StringBuilder();
+     for (String part : new String[] { a.getFullname(), a.getName(), a.getSurname() }) {
+         if (part != null && !part.isEmpty()) {
+             joined.append(part);
+         }
+     }
+     final boolean invalid = joined.toString().toLowerCase().matches(INVALID_AUTHOR_REGEX);
+     return !invalid;
+ }
+
+ /**
+  * Cleans a list of pids: drops null entries, entries with a blank value, entries whose trimmed and
+  * lower-cased value is in PID_BLACKLIST, and entries without a qualifier or with a blank qualifier classid;
+  * the survivors are normalised with CleaningFunctions.normalizePidValue and finally screened with
+  * CleaningFunctions.pidFilter.
+  *
+  * @param pids the pids to clean, must not be null
+  * @return a new list holding the pids that passed every check
+  */
+ private static List processPidCleaning(List pids) {
+     return pids
+         .stream()
+         .filter(
+             // same checks, same order as before; && short-circuits exactly like the chained filters did
+             sp -> Objects.nonNull(sp)
+                 && StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))
+                 && !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())
+                 && Objects.nonNull(sp.getQualifier())
+                 && StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+         .map(CleaningFunctions::normalizePidValue)
+         .filter(CleaningFunctions::pidFilter)
+         .collect(Collectors.toList());
+ }
+
+ /**
+  * Sets both scheme id and scheme name of the qualifier to the given vocabulary name, but only when the
+  * qualifier is not null and its scheme id is blank.
+  *
+  * @param q the qualifier to fix, possibly null
+  * @param vocabularyName the value to store as scheme id and scheme name
+  */
+ private static void fixVocabName(Qualifier q, String vocabularyName) {
+     if (Objects.isNull(q) || StringUtils.isNotBlank(q.getSchemeid())) {
+         return;
+     }
+     q.setSchemeid(vocabularyName);
+     q.setSchemename(vocabularyName);
+ }
+
+ /**
+  * Shortcut for OafMapperUtils.accessRight that passes the given scheme for both of its trailing arguments.
+  *
+  * @param classid forwarded as first argument
+  * @param classname forwarded as second argument
+  * @param scheme forwarded as third and fourth argument
+  * @return whatever OafMapperUtils.accessRight returns
+  */
+ private static AccessRight accessRight(String classid, String classname, String scheme) {
+     return OafMapperUtils.accessRight(classid, classname, scheme, scheme);
+ }
+
+ /**
+  * Shortcut for OafMapperUtils.qualifier that passes the given scheme for both of its trailing arguments.
+  *
+  * @param classid forwarded as first argument
+  * @param classname forwarded as second argument
+  * @param scheme forwarded as third and fourth argument
+  * @return whatever OafMapperUtils.qualifier returns
+  */
+ private static Qualifier qualifier(String classid, String classname, String scheme) {
+     return OafMapperUtils.qualifier(classid, classname, scheme, scheme);
+ }
+
+ /**
+  * Replaces every match of CLEANING_REGEX in the value of the property with a single space.
+  *
+  * @param s the property to clean; its value must not be null
+  * @return the same instance, with the updated value
+  */
+ protected static StructuredProperty cleanValue(StructuredProperty s) {
+     final String flattened = s.getValue().replaceAll(CLEANING_REGEX, " ");
+     s.setValue(flattened);
+     return s;
+ }
+
+ /**
+  * Replaces every match of CLEANING_REGEX in the value of the field with a single space.
+  *
+  * @param s the field to clean; its value must not be null
+  * @return the same instance, with the updated value
+  */
+ protected static Field cleanValue(Field s) {
+     final String flattened = s.getValue().replaceAll(CLEANING_REGEX, " ");
+     s.setValue(flattened);
+     return s;
+ }
+
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
new file mode 100644
index 000000000..c6a8fd5a7
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
@@ -0,0 +1,368 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class OafMapperUtils {
+
+ /**
+  * Merges two Oaf objects, dispatching on the type of {@code left}.
+  * <p>
+  * Entities are handed over to {@link #mergeEntities(OafEntity, OafEntity)}; relations are merged in place
+  * into {@code left}.
+  *
+  * @param left the first object; its type drives the dispatch
+  * @param right the second object, cast to the same family as {@code left}
+  * @return the merge result (for relations this is {@code left})
+  * @throws RuntimeException when {@code left} is neither an OafEntity nor a Relation
+  */
+ public static Oaf merge(final Oaf left, final Oaf right) {
+     if (ModelSupport.isSubClass(left, OafEntity.class)) {
+         return mergeEntities((OafEntity) left, (OafEntity) right);
+     }
+     if (!ModelSupport.isSubClass(left, Relation.class)) {
+         throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
+     }
+     final Relation leftRelation = (Relation) left;
+     leftRelation.mergeFrom((Relation) right);
+     return left;
+ }
+
+ /**
+  * Merges two entities, dispatching on the type of {@code left}.
+  * <p>
+  * Results are handed over to {@link #mergeResults(Result, Result)}; datasources, organizations and
+  * projects are merged in place into {@code left}.
+  *
+  * @param left the first entity; its type drives the dispatch
+  * @param right the second entity
+  * @return the merge result (for non-Result entities this is {@code left})
+  * @throws RuntimeException when {@code left} is none of Result, Datasource, Organization, Project
+  */
+ public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
+     if (ModelSupport.isSubClass(left, Result.class)) {
+         return mergeResults((Result) left, (Result) right);
+     }
+     final boolean mergeableInPlace = ModelSupport.isSubClass(left, Datasource.class)
+         || ModelSupport.isSubClass(left, Organization.class)
+         || ModelSupport.isSubClass(left, Project.class);
+     if (!mergeableInPlace) {
+         throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
+     }
+     left.mergeFrom(right);
+     return left;
+ }
+
+ /**
+  * Merges two results, letting {@code ResultTypeComparator} pick the merge target.
+  * <p>
+  * When {@code left} compares lower than {@code right} (comparison &lt; 0), {@code right} is merged into
+  * {@code left}; otherwise {@code left} is merged into {@code right}.
+  *
+  * @param left the first result
+  * @param right the second result
+  * @return the instance that received the merge
+  */
+ public static Result mergeResults(Result left, Result right) {
+     final boolean leftIsTarget = new ResultTypeComparator().compare(left, right) < 0;
+     final Result target = leftIsTarget ? left : right;
+     final Result source = leftIsTarget ? right : left;
+     target.mergeFrom(source);
+     return target;
+ }
+
+ /**
+  * Builds a {@code KeyValue} holding the given key and value.
+  *
+  * @param k the key
+  * @param v the value
+  * @return a new KeyValue with key and value set
+  */
+ public static KeyValue keyValue(final String k, final String v) {
+     final KeyValue pair = new KeyValue();
+     pair.setKey(k);
+     pair.setValue(v);
+     return pair;
+ }
+
+ public static List listKeyValues(final String... s) { // Builds a list of KeyValue from alternating key, value arguments. NOTE(review): generic type arguments look stripped in this rendering — confirm against the real source
+ if (s.length % 2 > 0) { // an odd number of arguments cannot be paired up
+ throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
+ }
+
+ final List list = new ArrayList<>();
+ for (int i = 0; i < s.length; i += 2) { // step by two: s[i] is the key, s[i + 1] its value
+ list.add(keyValue(s[i], s[i + 1]));
+ }
+ return list; // empty when no arguments were given
+ }
+
+ public static Field field(final T value, final DataInfo info) { // Wraps a value and its DataInfo into a Field. NOTE(review): the type parameter declaration looks stripped in this rendering — confirm against the real source
+ if (value == null || StringUtils.isBlank(value.toString())) { // null values and values whose toString() is blank produce no field
+ return null;
+ }
+
+ final Field field = new Field<>();
+ field.setValue(value);
+ field.setDataInfo(info);
+ return field;
+ }
+
+ public static List> listFields(final DataInfo info, final String... values) { // Builds one Field per given string, all sharing the same DataInfo. NOTE(review): "List>" shows that generic type arguments were stripped in this rendering — confirm against the real source
+ return Arrays
+ .stream(values)
+ .map(v -> field(v, info)) // field(...) returns null for null or blank values
+ .filter(Objects::nonNull) // ... and those nulls are dropped here
+ .filter(distinctByKey(f -> f.getValue())) // presumably keeps one field per distinct value — distinctByKey's body is outside this view, verify
+ .collect(Collectors.toList());
+ }
+
+ public static List> listFields(final DataInfo info, final List values) { // Builds one Field per element of the given list, all sharing the same DataInfo. NOTE(review): "List>" shows that generic type arguments were stripped in this rendering — confirm against the real source
+ return values
+ .stream()
+ .map(v -> field(v, info)) // field(...) returns null for null or blank values
+ .filter(Objects::nonNull) // ... and those nulls are dropped here
+ .filter(distinctByKey(f -> f.getValue())) // presumably keeps one field per distinct value — distinctByKey's body is outside this view, verify
+ .collect(Collectors.toList());
+ }
+
+ /**
+  * Builds the "unknown" qualifier for the given scheme.
+  *
+  * @param schemeid the scheme id of the qualifier
+  * @param schemename the scheme name of the qualifier
+  * @return a qualifier with classid {@code UNKNOWN} and classname {@code Unknown}
+  */
+ public static Qualifier unknown(final String schemeid, final String schemename) {
+     final String unknownClassid = "UNKNOWN";
+     final String unknownClassname = "Unknown";
+     return qualifier(unknownClassid, unknownClassname, schemeid, schemename);
+ }
+
+ /**
+  * Builds an {@code AccessRight} without an open access route.
+  *
+  * @param classid the class id
+  * @param classname the class name
+  * @param schemeid the scheme id
+  * @param schemename the scheme name
+  * @return a new AccessRight whose open access route is {@code null}
+  */
+ public static AccessRight accessRight(
+     final String classid,
+     final String classname,
+     final String schemeid,
+     final String schemename) {
+     final OpenAccessRoute noRoute = null;
+     return accessRight(classid, classname, schemeid, schemename, noRoute);
+ }
+
+ /**
+  * Builds an {@code AccessRight} from its individual parts.
+  *
+  * @param classid the class id
+  * @param classname the class name
+  * @param schemeid the scheme id
+  * @param schemename the scheme name
+  * @param openAccessRoute the open access route, may be {@code null}
+  * @return a new AccessRight populated with the given values
+  */
+ public static AccessRight accessRight(
+     final String classid,
+     final String classname,
+     final String schemeid,
+     final String schemename,
+     final OpenAccessRoute openAccessRoute) {
+     final AccessRight right = new AccessRight();
+     right.setClassid(classid);
+     right.setClassname(classname);
+     right.setSchemeid(schemeid);
+     right.setSchemename(schemename);
+     right.setOpenAccessRoute(openAccessRoute);
+     return right;
+ }
+
+ /**
+  * Builds a {@code Qualifier} from its individual parts.
+  *
+  * @param classid the class id
+  * @param classname the class name
+  * @param schemeid the scheme id
+  * @param schemename the scheme name
+  * @return a new Qualifier populated with the given values
+  */
+ public static Qualifier qualifier(
+     final String classid,
+     final String classname,
+     final String schemeid,
+     final String schemename) {
+     final Qualifier result = new Qualifier();
+     result.setClassid(classid);
+     result.setClassname(classname);
+     result.setSchemeid(schemeid);
+     result.setSchemename(schemename);
+     return result;
+ }
+
+ /**
+  * Creates a copy of the given qualifier.
+  * <p>
+  * Only classid, classname, schemeid and schemename are copied.
+  *
+  * @param qualifier the qualifier to copy; must not be {@code null}
+  * @return a new Qualifier carrying the same four values
+  */
+ public static Qualifier qualifier(final Qualifier qualifier) {
+     return qualifier(
+         qualifier.getClassid(),
+         qualifier.getClassname(),
+         qualifier.getSchemeid(),
+         qualifier.getSchemename());
+ }
+
+ /**
+  * Builds a {@code StructuredProperty} whose qualifier is created from the given class and scheme values.
+  *
+  * @param value the property value; {@code null} yields a {@code null} result
+  * @param classid the qualifier class id
+  * @param classname the qualifier class name
+  * @param schemeid the qualifier scheme id
+  * @param schemename the qualifier scheme name
+  * @param dataInfo the data info attached to the property
+  * @return the new property, or {@code null} when {@code value} is {@code null}
+  */
+ public static StructuredProperty structuredProperty(
+     final String value,
+     final String classid,
+     final String classname,
+     final String schemeid,
+     final String schemename,
+     final DataInfo dataInfo) {
+     final Qualifier propertyQualifier = qualifier(classid, classname, schemeid, schemename);
+     return structuredProperty(value, propertyQualifier, dataInfo);
+ }
+
+ /**
+  * Builds a {@code StructuredProperty} from a value, a qualifier and a data info.
+  * <p>
+  * Only a {@code null} value is rejected; empty or blank values still produce a property.
+  *
+  * @param value the property value
+  * @param qualifier the qualifier attached to the property
+  * @param dataInfo the data info attached to the property
+  * @return the new property, or {@code null} when {@code value} is {@code null}
+  */
+ public static StructuredProperty structuredProperty(
+     final String value,
+     final Qualifier qualifier,
+     final DataInfo dataInfo) {
+     if (Objects.isNull(value)) {
+         return null;
+     }
+     final StructuredProperty property = new StructuredProperty();
+     property.setValue(value);
+     property.setQualifier(qualifier);
+     property.setDataInfo(dataInfo);
+     return property;
+ }
+
+ /**
+  * Builds an {@code ExtraInfo} from its individual parts.
+  *
+  * @param name the name
+  * @param value the value
+  * @param typology the typology
+  * @param provenance the provenance
+  * @param trust the trust
+  * @return a new ExtraInfo populated with the given values
+  */
+ public static ExtraInfo extraInfo(
+     final String name,
+     final String value,
+     final String typology,
+     final String provenance,
+     final String trust) {
+     final ExtraInfo extra = new ExtraInfo();
+     extra.setName(name);
+     extra.setValue(value);
+     extra.setTypology(typology);
+     extra.setProvenance(provenance);
+     extra.setTrust(trust);
+     return extra;
+ }
+
+ /**
+  * Builds an {@code OAIProvenance} wrapping an {@code OriginDescription} filled with the given values.
+  *
+  * @param identifier the identifier
+  * @param baseURL the base URL
+  * @param metadataNamespace the metadata namespace
+  * @param altered the altered flag
+  * @param datestamp the datestamp
+  * @param harvestDate the harvest date
+  * @return a new OAIProvenance whose origin description carries the given values
+  */
+ public static OAIProvenance oaiIProvenance(
+     final String identifier,
+     final String baseURL,
+     final String metadataNamespace,
+     final Boolean altered,
+     final String datestamp,
+     final String harvestDate) {
+     final OriginDescription origin = new OriginDescription();
+     origin.setIdentifier(identifier);
+     origin.setBaseURL(baseURL);
+     origin.setMetadataNamespace(metadataNamespace);
+     origin.setAltered(altered);
+     origin.setDatestamp(datestamp);
+     origin.setHarvestDate(harvestDate);
+
+     final OAIProvenance provenance = new OAIProvenance();
+     provenance.setOriginDescription(origin);
+     return provenance;
+ }
+
+ /**
+  * Builds a {@code Journal} carrying only name and ISSNs.
+  * <p>
+  * Unlike the full overload, this one requires at least one non-blank ISSN: a name alone is not enough.
+  *
+  * @param name the journal name
+  * @param issnPrinted the printed ISSN
+  * @param issnOnline the online ISSN
+  * @param issnLinking the linking ISSN
+  * @param dataInfo the data info attached to the journal
+  * @return the new journal, or {@code null} when all three ISSNs are blank
+  */
+ public static Journal journal(
+     final String name,
+     final String issnPrinted,
+     final String issnOnline,
+     final String issnLinking,
+     final DataInfo dataInfo) {
+     if (!hasIssn(issnPrinted, issnOnline, issnLinking)) {
+         return null;
+     }
+     // ep, iss, sp, vol, edition, conferenceplace and conferencedate are left unset by this overload
+     return journal(
+         name, issnPrinted, issnOnline, issnLinking,
+         null, null, null, null, null, null, null,
+         dataInfo);
+ }
+
+ /**
+  * Builds a fully specified {@code Journal}.
+  * <p>
+  * A journal is produced when the name is not blank or at least one ISSN is not blank.
+  *
+  * @param name the journal name
+  * @param issnPrinted the printed ISSN
+  * @param issnOnline the online ISSN
+  * @param issnLinking the linking ISSN
+  * @param ep value stored via {@code setEp}
+  * @param iss value stored via {@code setIss}
+  * @param sp value stored via {@code setSp}
+  * @param vol value stored via {@code setVol}
+  * @param edition the edition
+  * @param conferenceplace the conference place
+  * @param conferencedate the conference date
+  * @param dataInfo the data info attached to the journal
+  * @return the new journal, or {@code null} when the name and all ISSNs are blank
+  */
+ public static Journal journal(
+     final String name,
+     final String issnPrinted,
+     final String issnOnline,
+     final String issnLinking,
+     final String ep,
+     final String iss,
+     final String sp,
+     final String vol,
+     final String edition,
+     final String conferenceplace,
+     final String conferencedate,
+     final DataInfo dataInfo) {
+     if (StringUtils.isBlank(name) && !hasIssn(issnPrinted, issnOnline, issnLinking)) {
+         return null;
+     }
+     final Journal journal = new Journal();
+     journal.setName(name);
+     journal.setIssnPrinted(issnPrinted);
+     journal.setIssnOnline(issnOnline);
+     journal.setIssnLinking(issnLinking);
+     journal.setEp(ep);
+     journal.setIss(iss);
+     journal.setSp(sp);
+     journal.setVol(vol);
+     journal.setEdition(edition);
+     journal.setConferenceplace(conferenceplace);
+     journal.setConferencedate(conferencedate);
+     journal.setDataInfo(dataInfo);
+     return journal;
+ }
+
+ /**
+  * Tells whether at least one of the three ISSNs is not blank.
+  *
+  * @param issnPrinted the printed ISSN
+  * @param issnOnline the online ISSN
+  * @param issnLinking the linking ISSN
+  * @return {@code true} as soon as one of the arguments is not blank, {@code false} otherwise
+  */
+ private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
+     if (StringUtils.isNotBlank(issnPrinted)) {
+         return true;
+     }
+     if (StringUtils.isNotBlank(issnOnline)) {
+         return true;
+     }
+     return StringUtils.isNotBlank(issnLinking);
+ }
+
+ /**
+  * Builds a {@code DataInfo} from its individual parts.
+  *
+  * @param deletedbyinference the deleted-by-inference flag
+  * @param inferenceprovenance the inference provenance
+  * @param inferred the inferred flag
+  * @param invisible the invisible flag
+  * @param provenanceaction the provenance action qualifier
+  * @param trust the trust
+  * @return a new DataInfo populated with the given values
+  */
+ public static DataInfo dataInfo(
+     final Boolean deletedbyinference,
+     final String inferenceprovenance,
+     final Boolean inferred,
+     final Boolean invisible,
+     final Qualifier provenanceaction,
+     final String trust) {
+     final DataInfo info = new DataInfo();
+     info.setDeletedbyinference(deletedbyinference);
+     info.setInferenceprovenance(inferenceprovenance);
+     info.setInferred(inferred);
+     info.setInvisible(invisible);
+     info.setProvenanceaction(provenanceaction);
+     info.setTrust(trust);
+     return info;
+ }
+
+ /**
+  * Builds an OpenAIRE identifier of the form {@code prefix|originalId}.
+  * <p>
+  * With {@code to_md5} the original id is split on its first {@code "::"}: the part before is kept as
+  * namespace prefix, the part after is replaced by {@code IdentifierFactory.md5} of it, giving
+  * {@code prefix|nsPrefix::md5}.
+  *
+  * @param prefix the numeric prefix placed before the pipe
+  * @param originalId the original identifier
+  * @param to_md5 whether the part after {@code "::"} must be hashed
+  * @return the identifier, or {@code null} when {@code originalId} is blank
+  */
+ public static String createOpenaireId(
+     final int prefix,
+     final String originalId,
+     final boolean to_md5) {
+     if (StringUtils.isBlank(originalId)) {
+         return null;
+     }
+     if (!to_md5) {
+         return String.format("%s|%s", prefix, originalId);
+     }
+     final String nsPrefix = StringUtils.substringBefore(originalId, "::");
+     final String hashedRest = IdentifierFactory.md5(StringUtils.substringAfter(originalId, "::"));
+     return String.format("%s|%s::%s", prefix, nsPrefix, hashedRest);
+ }
+
+ /**
+  * Builds an OpenAIRE identifier, choosing the numeric prefix from the entity type name.
+  * <p>
+  * Prefixes: {@code datasource} 10, {@code organization} 20, {@code person} 30, {@code project} 40,
+  * any other type 50.
+  *
+  * @param type the entity type name; must not be {@code null}
+  * @param originalId the original identifier
+  * @param to_md5 whether the part after {@code "::"} must be hashed
+  * @return the identifier, or {@code null} when {@code originalId} is blank
+  */
+ public static String createOpenaireId(
+     final String type,
+     final String originalId,
+     final boolean to_md5) {
+     final int prefix;
+     switch (type) {
+         case "datasource":
+             prefix = 10;
+             break;
+         case "organization":
+             prefix = 20;
+             break;
+         case "person":
+             prefix = 30;
+             break;
+         case "project":
+             prefix = 40;
+             break;
+         default:
+             prefix = 50;
+             break;
+     }
+     return createOpenaireId(prefix, originalId, to_md5);
+ }
+
+ /**
+  * Null-safe {@code toString()}.
+  *
+  * @param o the object to render, may be {@code null}
+  * @return the empty string when {@code o} is {@code null}, otherwise {@code o.toString()}
+  */
+ public static String asString(final Object o) {
+     return Objects.toString(o, "");
+ }
+
+ public static Predicate distinctByKey(
+ final Function super T, ?> keyExtractor) {
+ final Map