merging with branch beta

This commit is contained in:
Miriam Baglioni 2021-07-21 09:46:40 +02:00
commit b226ba4439
7 changed files with 39 additions and 8 deletions

View File

@ -1,4 +1,4 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Generate DOIBoost ActionSet for BETA - PREPROCESS" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>

View File

@ -1,4 +1,4 @@
<workflow-app name="Generate DOIBoost ActionSet for PROD" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Generate DOIBoost ActionSet for BETA - PROCESS" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>

View File

@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper {
private final boolean shouldHashId; private final boolean shouldHashId;
private final boolean forceOriginalId;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper {
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
} }
/**
 * Creates a mapper and stores the supplied vocabularies and flags in the corresponding fields.
 *
 * @param vocs the vocabulary group kept in {@code this.vocs}
 * @param invisible flag kept in {@code this.invisible}
 * @param shouldHashId flag kept in {@code this.shouldHashId}; it is the second argument passed to
 *        {@code IdentifierFactory.createIdentifier} when the entity id is recomputed
 * @param forceOriginalId flag kept in {@code this.forceOriginalId}; when {@code true}, the
 *        entity-building code shown in this change skips the
 *        {@code IdentifierFactory.createIdentifier} call, so the id already set on the entity is left
 *        as is
 */
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId, final boolean forceOriginalId) {
this.vocs = vocs;
this.invisible = invisible;
this.shouldHashId = shouldHashId;
this.forceOriginalId = forceOriginalId;
}
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId) { final boolean shouldHashId) {
this.vocs = vocs; this.vocs = vocs;
this.invisible = invisible; this.invisible = invisible;
this.shouldHashId = shouldHashId; this.shouldHashId = shouldHashId;
this.forceOriginalId = false;
} }
public List<Oaf> processMdRecord(final String xml) { public List<Oaf> processMdRecord(final String xml) {
@ -190,10 +201,16 @@ public abstract class AbstractMdRecordToOafMapper {
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) { final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
entity.getOriginalId().add(entity.getId()); originalId.add(entity.getId());
entity.setId(id); entity.setOriginalId(Lists.newArrayList(originalId));
if (!forceOriginalId) {
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) {
entity.setId(id);
}
} }
final List<Oaf> oafs = Lists.newArrayList(entity); final List<Oaf> oafs = Lists.newArrayList(entity);

View File

@ -163,11 +163,13 @@ public class GenerateEntitiesApplication {
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "oaf-store-cleaned": case "oaf-store-cleaned":
case "oaf-store-claim":
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s); return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "oaf-store-claim":
return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
case "odf-store-cleaned": case "odf-store-cleaned":
case "odf-store-claim":
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s); return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "odf-store-claim":
return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
case "oaf-store-intersection": case "oaf-store-intersection":
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s); return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "odf-store-intersection": case "odf-store-intersection":

View File

@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
/**
 * Creates an OAF-to-OAF mapper by forwarding all four arguments, unchanged and in the same order,
 * to the superclass constructor.
 *
 * @param vocs forwarded to the superclass constructor
 * @param invisible forwarded to the superclass constructor
 * @param shouldHashId forwarded to the superclass constructor
 * @param forceOrginalId forwarded to the superclass constructor as its fourth argument.
 *        NOTE(review): the name looks like a misspelling of "forceOriginalId" (the spelling used by
 *        the superclass parameter) - consider renaming
 */
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId);
}
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId); super(vocs, invisible, shouldHashId);
} }

View File

@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
/**
 * Creates an ODF-to-OAF mapper by forwarding all four arguments, unchanged and in the same order,
 * to the superclass constructor.
 *
 * @param vocs forwarded to the superclass constructor
 * @param invisible forwarded to the superclass constructor
 * @param shouldHashId forwarded to the superclass constructor
 * @param forceOrginalId forwarded to the superclass constructor as its fourth argument.
 *        NOTE(review): the name looks like a misspelling of "forceOriginalId" (the spelling used by
 *        the superclass parameter) - consider renaming
 */
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId);
}
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId); super(vocs, invisible, shouldHashId);
} }

View File

@ -16,6 +16,7 @@ import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource; import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable {
.getOriginalId() .getOriginalId()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(id -> !id.matches("^\\d{2}" + IdentifierFactory.ID_PREFIX_SEPARATOR))
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) .map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }