From 0977baf41d26be303200ffdb8ed7fb224b53db63 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 19 Jul 2021 17:43:52 +0200 Subject: [PATCH 1/5] contents mapped from the stores with 'claim' interpretation will not change their identifier along their way towards the graph --- .../raw/AbstractMdRecordToOafMapper.java | 24 +++++++++++++++---- .../raw/GenerateEntitiesApplication.java | 6 +++-- .../dhp/oa/graph/raw/OafToOafMapper.java | 5 ++++ .../dhp/oa/graph/raw/OdfToOafMapper.java | 5 ++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 1e80dfd46c..cbaadf85c0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper { private final boolean shouldHashId; + private final boolean forceOriginalId; + protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; @@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId, final boolean forceOriginalId) { + this.vocs = vocs; + this.invisible = invisible; + this.shouldHashId = shouldHashId; + this.forceOriginalId = forceOriginalId; + } + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { this.vocs = vocs; this.invisible = invisible; this.shouldHashId = shouldHashId; + this.forceOriginalId = false; } public List processMdRecord(final String xml) { @@ -190,10 +201,15 @@ public abstract class AbstractMdRecordToOafMapper { final long lastUpdateTimestamp) { final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); - if (!id.equals(entity.getId())) { - entity.getOriginalId().add(entity.getId()); - entity.setId(id); + + if (!forceOriginalId) { + final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); + if (!id.equals(entity.getId())) { + final Set originalId = Sets.newHashSet(entity.getOriginalId()); + originalId.add(entity.getId()); + entity.setOriginalId(Lists.newArrayList(originalId)); + entity.setId(id); + } } final List oafs = Lists.newArrayList(entity); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index fcd6f459a7..bbfb7429f3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -163,11 +163,13 @@ public class GenerateEntitiesApplication { switch (type.toLowerCase()) { case "oaf-store-cleaned": - case "oaf-store-claim": return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s); + case "oaf-store-claim": + return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s); case "odf-store-cleaned": - case "odf-store-claim": return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s); + case "odf-store-claim": + return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s); case "oaf-store-intersection": return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s); case "odf-store-intersection": diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 06aeab3451..d753cddeb4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; public class OafToOafMapper extends AbstractMdRecordToOafMapper { + public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, + final boolean forceOrginalId) { + super(vocs, invisible, shouldHashId, forceOrginalId); + } + public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { super(vocs, invisible, shouldHashId); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index b7400873b6..7925a78269 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; + public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, + final boolean forceOrginalId) { + super(vocs, invisible, shouldHashId, forceOrginalId); + } + public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { super(vocs, invisible, shouldHashId); } From 65934888a1ad64226472d70a594d9fe701ee0487 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 19 Jul 2021 17:52:24 +0200 Subject: [PATCH 2/5] adding record identifier among the originalIds regardless of what IdentifierFactory produces --- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index cbaadf85c0..03c3eeb3ce 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -202,12 +202,13 @@ public abstract class AbstractMdRecordToOafMapper { final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + final Set originalId = Sets.newHashSet(entity.getOriginalId()); + originalId.add(entity.getId()); + entity.setOriginalId(Lists.newArrayList(originalId)); + if (!forceOriginalId) { final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); if (!id.equals(entity.getId())) { - final Set originalId = Sets.newHashSet(entity.getOriginalId()); - originalId.add(entity.getId()); - entity.setOriginalId(Lists.newArrayList(originalId)); entity.setId(id); } } From 54acc5373b6e32071afa837bae209fcfcbd22500 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 19 Jul 2021 18:18:09 +0200 Subject: [PATCH 3/5] changed the name of the workflows --- .../eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml index 03f7b7566c..52f958d4d9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sparkDriverMemory From 83fe31c92e0588de7c1176a004751e0f32cbad2c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 19 Jul 2021 18:19:14 +0200 Subject: [PATCH 4/5] changed the name of the workflows --- .../eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml index f845d97f30..8793f339ca 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sparkDriverMemory From 10d7b4f0b4f3a5e9b2eb0ab8b40b4560dc1d4069 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 20 Jul 2021 11:51:33 +0200 Subject: [PATCH 5/5] filtering 'old' OpenAIRE ids from the entity.originalId[] array in the OAF -> XML searialization procedure --- .../eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 86bbae99ec..a985d23718 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -16,6 +16,7 @@ import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import org.apache.commons.lang3.StringUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable { .getOriginalId() .stream() .filter(Objects::nonNull) + .filter(id -> !id.matches("^\\d{2}" + IdentifierFactory.ID_PREFIX_SEPARATOR)) .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) .collect(Collectors.toList())); }