GenerateEntitiesApplication can be configured to hash the id value or not

This commit is contained in:
Claudio Atzori 2020-11-30 12:00:38 +01:00
parent 758d27745d
commit 2c407e775e
8 changed files with 82 additions and 49 deletions

View File

@ -8,7 +8,7 @@ import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
@ -31,17 +31,16 @@ public class IdentifierFactory implements Serializable {
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12;
public static final String NONE = "none";
/**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
* when no PID is available
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @param md5 indicates whether the PID value should be hashed (MD5) or used as-is.
* @return an identifier from the most relevant PID, entity.id otherwise
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
return entity.getId();
}
@ -69,15 +68,27 @@ public class IdentifierFactory implements Serializable {
.stream()
.sorted(new PidValueComparator())
.findFirst()
.map(s -> idFromPid(entity, s))
.map(s -> idFromPid(entity, s, md5))
.orElseGet(entity::getId))
.orElseGet(entity::getId))
.orElseGet(entity::getId);
}
/**
* Creates an identifier for the given entity with PID hashing enabled.
* Equivalent to calling {@code createIdentifier(entity, true)}.
*
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type.
* @return the identifier returned by the two-argument overload with {@code md5 = true}
* @see IdentifierFactory#createIdentifier(OafEntity, boolean)
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
return createIdentifier(entity, true);
}
protected static boolean pidFilter(StructuredProperty s) {
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
StringUtils.isBlank(s.getValue()) ||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) {
return false;
}
try {
@ -93,21 +104,14 @@ public class IdentifierFactory implements Serializable {
}
}
/**
 * Checks that the given identifier is non-blank and matches {@code ID_REGEX}.
 *
 * @param s the identifier to check
 * @return the same string, unchanged, when it passes the check
 * @throws RuntimeException when the identifier is blank or does not match {@code ID_REGEX}
 */
private static String verifyIdSyntax(String s) {
    // The blank check comes first so that a null value never reaches String#matches.
    final boolean wellFormed = !StringUtils.isBlank(s) && s.matches(ID_REGEX);
    if (wellFormed) {
        return s;
    }
    throw new RuntimeException(String.format("malformed id: '%s'", s));
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
final String value = CleaningFunctions.normalizePidValue(s).getValue();
return new StringBuilder()
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR)
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
.append(md5 ? DHPUtils.md5(value) : value)
.toString();
}

View File

@ -22,24 +22,41 @@ public class IdentifierFactoryTest {
@Test
public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier(
"publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"), true);
verifyIdentifier(
"publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"), true);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"), true);
verifyIdentifier(
"publication_urn1.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"), true);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID);
verifyIdentifier("publication_4.json", defaultID);
verifyIdentifier("publication_5.json", defaultID);
verifyIdentifier("publication_3.json", defaultID, true);
verifyIdentifier("publication_4.json", defaultID, true);
verifyIdentifier("publication_5.json", defaultID, true);
}
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
/**
* Checks the identifiers produced when hashing is disabled ({@code md5 = false}): for the
* doi, pmc and urn fixtures the expected id is the type prefix followed by the PID value
* verbatim. The publication_3/4/5 fixtures are expected to yield {@code defaultID} instead.
*/
@Test
public void testCreateIdentifierForPublicationNoHash() throws IOException {
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2011.03.013", false);
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::21459329", false);
verifyIdentifier(
"publication_urn1.json", "50|urn_________::urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2", false);
// Fallback expectation: these fixtures should resolve to the same id whether or not hashing is on.
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, false);
verifyIdentifier("publication_4.json", defaultID, false);
verifyIdentifier("publication_5.json", defaultID, false);
}
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
String id = IdentifierFactory.createIdentifier(pub);
String id = IdentifierFactory.createIdentifier(pub, md5);
assertNotNull(id);
assertEquals(expectedID, id);

View File

@ -26,6 +26,8 @@ public abstract class AbstractMdRecordToOafMapper {
private final boolean invisible;
private final boolean shouldHashId;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@ -50,9 +52,11 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId) {
this.vocs = vocs;
this.invisible = invisible;
this.shouldHashId = shouldHashId;
}
public List<Oaf> processMdRecord(final String xml) {
@ -137,7 +141,7 @@ public abstract class AbstractMdRecordToOafMapper {
final long lastUpdateTimestamp) {
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
final String id = IdentifierFactory.createIdentifier(entity);
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) {
entity.getOriginalId().add(entity.getId());
entity.setId(id);

View File

@ -64,13 +64,19 @@ public class GenerateEntitiesApplication {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final boolean shouldHashId = Optional
.ofNullable(parser.get("shouldHashId"))
.map(Boolean::valueOf)
.orElse(true);
log.info("shouldHashId: {}", shouldHashId);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
generateEntities(spark, vocs, sourcePaths, targetPath);
generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId);
});
}
@ -78,7 +84,8 @@ public class GenerateEntitiesApplication {
final SparkSession spark,
final VocabularyGroup vocs,
final String sourcePaths,
final String targetPath) {
final String targetPath,
final boolean shouldHashId) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final List<String> existingSourcePaths = Arrays
@ -97,7 +104,7 @@ public class GenerateEntitiesApplication {
sc
.sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), vocs))
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
.filter(Objects::nonNull)
.flatMap(list -> list.iterator()));
}
@ -113,20 +120,21 @@ public class GenerateEntitiesApplication {
private static List<Oaf> convertToListOaf(
final String id,
final String s,
final boolean shouldHashId,
final VocabularyGroup vocs) {
final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) {
case "oaf-store-cleaned":
case "oaf-store-claim":
return new OafToOafMapper(vocs, false).processMdRecord(s);
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "odf-store-cleaned":
case "odf-store-claim":
return new OdfToOafMapper(vocs, false).processMdRecord(s);
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "oaf-store-intersection":
return new OafToOafMapper(vocs, true).processMdRecord(s);
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "odf-store-intersection":
return new OdfToOafMapper(vocs, true).processMdRecord(s);
return new OdfToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization":

View File

@ -22,8 +22,8 @@ import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
super(vocs, invisible);
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId);
}
@Override

View File

@ -19,8 +19,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
super(vocs, invisible);
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible, shouldHashId);
}
@Override

View File

@ -77,7 +77,7 @@ public class GenerateEntitiesApplicationTest {
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
return new OdfToOafMapper(vocs, false)
return new OdfToOafMapper(vocs, false, true)
.processMdRecord(xml)
.stream()
.filter(s -> clazz.isAssignableFrom(s.getClass()))

View File

@ -52,7 +52,7 @@ public class MappersTest {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Publication);
@ -131,7 +131,7 @@ public class MappersTest {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, true).processMdRecord(xml);
final List<Oaf> list = new OafToOafMapper(vocs, true, true).processMdRecord(xml);
assertTrue(list.size() > 0);
assertTrue(list.get(0) instanceof Publication);
@ -146,7 +146,7 @@ public class MappersTest {
void testDataset() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Dataset);
@ -240,7 +240,7 @@ public class MappersTest {
void testSoftware() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Software);
@ -259,7 +259,7 @@ public class MappersTest {
void testDataset_2() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
@ -269,7 +269,7 @@ public class MappersTest {
@Test
void testClaimDedup() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
@ -279,7 +279,7 @@ public class MappersTest {
@Test
void testNakala() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
@ -303,7 +303,7 @@ public class MappersTest {
@Test
void testClaimFromCrossref() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
@ -319,7 +319,7 @@ public class MappersTest {
@Test
void testODFRecord() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
@ -333,7 +333,7 @@ public class MappersTest {
@Test
void testTextGrid() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));