forked from D-Net/dnet-hadoop
merging with branch beta
This commit is contained in:
commit
b226ba4439
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Generate DOIBoost ActionSet for BETA - PREPROCESS" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Generate DOIBoost ActionSet for PROD" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Generate DOIBoost ActionSet for BETA - PROCESS" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
|
|
|
@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
private final boolean shouldHashId;
|
||||
|
||||
private final boolean forceOriginalId;
|
||||
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
|
@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
|
||||
}
|
||||
|
||||
/**
 * Creates a mapper, storing each argument in the field of the same name.
 *
 * @param vocs            vocabulary group kept in {@code this.vocs}
 * @param invisible       flag kept in {@code this.invisible}
 * @param shouldHashId    flag kept in {@code this.shouldHashId}; the visible entity-creation code
 *                        passes it to {@code IdentifierFactory.createIdentifier}
 * @param forceOriginalId flag kept in {@code this.forceOriginalId}; the visible entity-creation code
 *                        only recomputes the entity id through {@code IdentifierFactory} when this
 *                        flag is {@code false}
 */
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
||||
final boolean shouldHashId, final boolean forceOriginalId) {
|
||||
this.vocs = vocs;
|
||||
this.invisible = invisible;
|
||||
this.shouldHashId = shouldHashId;
|
||||
this.forceOriginalId = forceOriginalId;
|
||||
}
|
||||
|
||||
/**
 * Creates a mapper with {@code forceOriginalId} fixed to {@code false}; the other arguments are
 * stored in the fields of the same name.
 *
 * @param vocs         vocabulary group kept in {@code this.vocs}
 * @param invisible    flag kept in {@code this.invisible}
 * @param shouldHashId flag kept in {@code this.shouldHashId}
 */
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
||||
final boolean shouldHashId) {
|
||||
this.vocs = vocs;
|
||||
this.invisible = invisible;
|
||||
this.shouldHashId = shouldHashId;
|
||||
// No explicit choice from the caller: default to not forcing the original id.
this.forceOriginalId = false;
|
||||
}
|
||||
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
|
@ -190,10 +201,16 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
final long lastUpdateTimestamp) {
|
||||
|
||||
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
|
||||
if (!id.equals(entity.getId())) {
|
||||
entity.getOriginalId().add(entity.getId());
|
||||
entity.setId(id);
|
||||
|
||||
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
|
||||
originalId.add(entity.getId());
|
||||
entity.setOriginalId(Lists.newArrayList(originalId));
|
||||
|
||||
if (!forceOriginalId) {
|
||||
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
|
||||
if (!id.equals(entity.getId())) {
|
||||
entity.setId(id);
|
||||
}
|
||||
}
|
||||
|
||||
final List<Oaf> oafs = Lists.newArrayList(entity);
|
||||
|
|
|
@ -163,11 +163,13 @@ public class GenerateEntitiesApplication {
|
|||
|
||||
switch (type.toLowerCase()) {
|
||||
case "oaf-store-cleaned":
|
||||
case "oaf-store-claim":
|
||||
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||
case "oaf-store-claim":
|
||||
return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
|
||||
case "odf-store-cleaned":
|
||||
case "odf-store-claim":
|
||||
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||
case "odf-store-claim":
|
||||
return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
|
||||
case "oaf-store-intersection":
|
||||
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
|
||||
case "odf-store-intersection":
|
||||
|
|
|
@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
|||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
/**
 * Creates an OAF-to-OAF mapper by forwarding all four arguments, unchanged and in order, to the
 * four-argument superclass constructor.
 *
 * @param vocs           forwarded to the superclass constructor
 * @param invisible      forwarded to the superclass constructor
 * @param shouldHashId   forwarded to the superclass constructor
 * @param forceOrginalId forwarded as the superclass' fourth argument
 */
// NOTE(review): the parameter is spelled "forceOrginalId" here while the superclass constructor
// spells it "forceOriginalId" - looks like a typo; confirm before renaming.
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||
final boolean forceOrginalId) {
|
||||
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||
}
|
||||
|
||||
/**
 * Creates an OAF-to-OAF mapper by forwarding the three arguments, unchanged and in order, to the
 * three-argument superclass constructor.
 *
 * @param vocs         forwarded to the superclass constructor
 * @param invisible    forwarded to the superclass constructor
 * @param shouldHashId forwarded to the superclass constructor
 */
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||
super(vocs, invisible, shouldHashId);
|
||||
}
|
||||
|
|
|
@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
||||
|
||||
/**
 * Creates an ODF-to-OAF mapper by forwarding all four arguments, unchanged and in order, to the
 * four-argument superclass constructor.
 *
 * @param vocs           forwarded to the superclass constructor
 * @param invisible      forwarded to the superclass constructor
 * @param shouldHashId   forwarded to the superclass constructor
 * @param forceOrginalId forwarded as the superclass' fourth argument
 */
// NOTE(review): the parameter is spelled "forceOrginalId" here while the superclass constructor
// spells it "forceOriginalId" - looks like a typo; confirm before renaming.
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||
final boolean forceOrginalId) {
|
||||
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||
}
|
||||
|
||||
/**
 * Creates an ODF-to-OAF mapper by forwarding the three arguments, unchanged and in order, to the
 * three-argument superclass constructor.
 *
 * @param vocs         forwarded to the superclass constructor
 * @param invisible    forwarded to the superclass constructor
 * @param shouldHashId forwarded to the superclass constructor
 */
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||
super(vocs, invisible, shouldHashId);
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@ import javax.xml.transform.*;
|
|||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
|
@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
.getOriginalId()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(id -> !id.matches("^\\d{2}" + IdentifierFactory.ID_PREFIX_SEPARATOR))
|
||||
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue