forked from antonis.lempesis/dnet-hadoop
merging with branch beta
This commit is contained in:
commit
b226ba4439
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Generate DOIBoost ActionSet for BETA - PREPROCESS" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="Generate DOIBoost ActionSet for PROD" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Generate DOIBoost ActionSet for BETA - PROCESS" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
|
|
|
@ -71,6 +71,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
private final boolean shouldHashId;
|
private final boolean shouldHashId;
|
||||||
|
|
||||||
|
private final boolean forceOriginalId;
|
||||||
|
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
|
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||||
|
@ -98,11 +100,20 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
|
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
||||||
|
final boolean shouldHashId, final boolean forceOriginalId) {
|
||||||
|
this.vocs = vocs;
|
||||||
|
this.invisible = invisible;
|
||||||
|
this.shouldHashId = shouldHashId;
|
||||||
|
this.forceOriginalId = forceOriginalId;
|
||||||
|
}
|
||||||
|
|
||||||
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
||||||
final boolean shouldHashId) {
|
final boolean shouldHashId) {
|
||||||
this.vocs = vocs;
|
this.vocs = vocs;
|
||||||
this.invisible = invisible;
|
this.invisible = invisible;
|
||||||
this.shouldHashId = shouldHashId;
|
this.shouldHashId = shouldHashId;
|
||||||
|
this.forceOriginalId = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Oaf> processMdRecord(final String xml) {
|
public List<Oaf> processMdRecord(final String xml) {
|
||||||
|
@ -190,10 +201,16 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp) {
|
||||||
|
|
||||||
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
|
|
||||||
if (!id.equals(entity.getId())) {
|
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
|
||||||
entity.getOriginalId().add(entity.getId());
|
originalId.add(entity.getId());
|
||||||
entity.setId(id);
|
entity.setOriginalId(Lists.newArrayList(originalId));
|
||||||
|
|
||||||
|
if (!forceOriginalId) {
|
||||||
|
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
|
||||||
|
if (!id.equals(entity.getId())) {
|
||||||
|
entity.setId(id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<Oaf> oafs = Lists.newArrayList(entity);
|
final List<Oaf> oafs = Lists.newArrayList(entity);
|
||||||
|
|
|
@ -163,11 +163,13 @@ public class GenerateEntitiesApplication {
|
||||||
|
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
case "oaf-store-cleaned":
|
case "oaf-store-cleaned":
|
||||||
case "oaf-store-claim":
|
|
||||||
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||||
|
case "oaf-store-claim":
|
||||||
|
return new OafToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
|
||||||
case "odf-store-cleaned":
|
case "odf-store-cleaned":
|
||||||
case "odf-store-claim":
|
|
||||||
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||||
|
case "odf-store-claim":
|
||||||
|
return new OdfToOafMapper(vocs, false, shouldHashId, true).processMdRecord(s);
|
||||||
case "oaf-store-intersection":
|
case "oaf-store-intersection":
|
||||||
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
|
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
|
||||||
case "odf-store-intersection":
|
case "odf-store-intersection":
|
||||||
|
|
|
@ -27,6 +27,11 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||||
|
final boolean forceOrginalId) {
|
||||||
|
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||||
|
}
|
||||||
|
|
||||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||||
super(vocs, invisible, shouldHashId);
|
super(vocs, invisible, shouldHashId);
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
||||||
|
|
||||||
|
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||||
|
final boolean forceOrginalId) {
|
||||||
|
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||||
|
}
|
||||||
|
|
||||||
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||||
super(vocs, invisible, shouldHashId);
|
super(vocs, invisible, shouldHashId);
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@ import javax.xml.transform.*;
|
||||||
import javax.xml.transform.dom.DOMSource;
|
import javax.xml.transform.dom.DOMSource;
|
||||||
import javax.xml.transform.stream.StreamResult;
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
|
@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.getOriginalId()
|
.getOriginalId()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
|
.filter(id -> !id.matches("^\\d{2}" + IdentifierFactory.ID_PREFIX_SEPARATOR))
|
||||||
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
.map(s -> XmlSerializationUtils.asXmlElement("originalId", s))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue