From 2c407e775ed1401a951737593cc2263bfd8b8d71 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 30 Nov 2020 12:00:38 +0100 Subject: [PATCH] GenerateEntitiesApplication can be configured to hash the id value or not --- .../schema/oaf/utils/IdentifierFactory.java | 36 ++++++++++--------- .../oaf/utils/IdentifierFactoryTest.java | 35 +++++++++++++----- .../raw/AbstractMdRecordToOafMapper.java | 8 +++-- .../raw/GenerateEntitiesApplication.java | 22 ++++++++---- .../dhp/oa/graph/raw/OafToOafMapper.java | 4 +-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 4 +-- .../raw/GenerateEntitiesApplicationTest.java | 2 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 20 +++++------ 8 files changed, 82 insertions(+), 49 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 01bb92bf6..7acc021b4 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -8,7 +8,7 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.OafEntity; @@ -31,17 +31,16 @@ public class IdentifierFactory implements Serializable { "(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)"; public static final int ID_PREFIX_LEN = 12; - public static final String NONE = "none"; /** * Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id * when no PID is available * @param entity the entity providing PIDs and a default ID. * @param the specific entity type. Currently Organization and Result subclasses are supported. + * @param md5 indicates whether should hash the PID value or not. * @return an identifier from the most relevant PID, entity.id otherwise */ - public static String createIdentifier(T entity) { - + public static String createIdentifier(T entity, boolean md5) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { return entity.getId(); } @@ -69,15 +68,27 @@ public class IdentifierFactory implements Serializable { .stream() .sorted(new PidValueComparator()) .findFirst() - .map(s -> idFromPid(entity, s)) + .map(s -> idFromPid(entity, s, md5)) .orElseGet(entity::getId)) .orElseGet(entity::getId)) .orElseGet(entity::getId); } + /** + * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} + */ + public static String createIdentifier(T entity) { + + return createIdentifier(entity, true); + } + protected static boolean pidFilter(StructuredProperty s) { if (Objects.isNull(s.getQualifier()) || - StringUtils.isBlank(StringUtils.trim(s.getValue()))) { + StringUtils.isBlank(s.getValue()) || + StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) { + return false; + } + if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) { return false; } try { @@ -93,21 +104,14 @@ public class IdentifierFactory implements Serializable { } } - private static String verifyIdSyntax(String s) { - if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { - throw new RuntimeException(String.format("malformed id: '%s'", s)); - } else { - return s; - } - } - - private static String idFromPid(T entity, StructuredProperty s) { + private static String idFromPid(T entity, StructuredProperty s, boolean md5) { + final String value = CleaningFunctions.normalizePidValue(s).getValue(); return new StringBuilder() .append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR)) .append(ID_PREFIX_SEPARATOR) .append(createPrefix(s.getQualifier().getClassid())) .append(ID_SEPARATOR) - .append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue())) + .append(md5 ? DHPUtils.md5(value) : value) .toString(); } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java index 2b34a46ca..17f172a42 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java @@ -22,24 +22,41 @@ public class IdentifierFactoryTest { @Test public void testCreateIdentifierForPublication() throws IOException { - verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); - verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013")); - verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329")); + verifyIdentifier( + "publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"), true); + verifyIdentifier( + "publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"), true); + verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"), true); verifyIdentifier( "publication_urn1.json", - "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2")); + "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"), true); final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; - verifyIdentifier("publication_3.json", defaultID); - verifyIdentifier("publication_4.json", defaultID); - verifyIdentifier("publication_5.json", defaultID); + verifyIdentifier("publication_3.json", defaultID, true); + verifyIdentifier("publication_4.json", defaultID, true); + verifyIdentifier("publication_5.json", defaultID, true); } - protected void verifyIdentifier(String filename, String expectedID) throws IOException { + @Test + public void testCreateIdentifierForPublicationNoHash() throws IOException { + + verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2011.03.013", false); + verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false); + verifyIdentifier("publication_pmc1.json", "50|pmc_________::21459329", false); + verifyIdentifier( + "publication_urn1.json", "50|urn_________::urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2", false); + + final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; + verifyIdentifier("publication_3.json", defaultID, false); + verifyIdentifier("publication_4.json", defaultID, false); + verifyIdentifier("publication_5.json", defaultID, false); + } + + protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException { final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class); - String id = IdentifierFactory.createIdentifier(pub); + String id = IdentifierFactory.createIdentifier(pub, md5); assertNotNull(id); assertEquals(expectedID, id); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 9db56198f..00a7f3a92 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -26,6 +26,8 @@ public abstract class AbstractMdRecordToOafMapper { private final boolean invisible; + private final boolean shouldHashId; + protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; @@ -50,9 +52,11 @@ public abstract class AbstractMdRecordToOafMapper { protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) { + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId) { this.vocs = vocs; this.invisible = invisible; + this.shouldHashId = shouldHashId; } public List processMdRecord(final String xml) { @@ -137,7 +141,7 @@ public abstract class AbstractMdRecordToOafMapper { final long lastUpdateTimestamp) { final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - final String id = IdentifierFactory.createIdentifier(entity); + final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); if (!id.equals(entity.getId())) { entity.getOriginalId().add(entity.getId()); entity.setId(id); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 2d32f62af..40020427a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -64,13 +64,19 @@ public class GenerateEntitiesApplication { final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: {}", isLookupUrl); + final boolean shouldHashId = Optional + .ofNullable(parser.get("shouldHashId")) + .map(Boolean::valueOf) + .orElse(true); + log.info("shouldHashId: {}", shouldHashId); + final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService); final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration()); - generateEntities(spark, vocs, sourcePaths, targetPath); + generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId); }); } @@ -78,7 +84,8 @@ public class GenerateEntitiesApplication { final SparkSession spark, final VocabularyGroup vocs, final String sourcePaths, - final String targetPath) { + final String targetPath, + final boolean shouldHashId) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final List existingSourcePaths = Arrays @@ -97,7 +104,7 @@ public class GenerateEntitiesApplication { sc .sequenceFile(sp, Text.class, Text.class) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), vocs)) + .map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs)) .filter(Objects::nonNull) .flatMap(list -> list.iterator())); } @@ -113,20 +120,21 @@ public class GenerateEntitiesApplication { private static List convertToListOaf( final String id, final String s, + final boolean shouldHashId, final VocabularyGroup vocs) { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { case "oaf-store-cleaned": case "oaf-store-claim": - return new OafToOafMapper(vocs, false).processMdRecord(s); + return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s); case "odf-store-cleaned": case "odf-store-claim": - return new OdfToOafMapper(vocs, false).processMdRecord(s); + return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s); case "oaf-store-intersection": - return new OafToOafMapper(vocs, true).processMdRecord(s); + return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s); case "odf-store-intersection": - return new OdfToOafMapper(vocs, true).processMdRecord(s); + return new OdfToOafMapper(vocs, true, shouldHashId).processMdRecord(s); case "datasource": return Arrays.asList(convertFromJson(s, Datasource.class)); case "organization": diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index e28e8bd3c..50208a079 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -22,8 +22,8 @@ import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; public class OafToOafMapper extends AbstractMdRecordToOafMapper { - public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) { - super(vocs, invisible); + public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { + super(vocs, invisible, shouldHashId); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 6ceaa405a..88a29fdd7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -19,8 +19,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; - public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) { - super(vocs, invisible); + public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { + super(vocs, invisible, shouldHashId); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java index 705f1dddb..4c1b0a739 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java @@ -77,7 +77,7 @@ public class GenerateEntitiesApplicationTest { protected Result getResult(String xmlFileName, Class clazz) throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName)); - return new OdfToOafMapper(vocs, false) + return new OdfToOafMapper(vocs, false, true) .processMdRecord(xml) .stream() .filter(s -> clazz.isAssignableFrom(s.getClass())) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 4e4a21fa9..5fc3cb5d0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -52,7 +52,7 @@ public class MappersTest { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); assertEquals(3, list.size()); assertTrue(list.get(0) instanceof Publication); @@ -131,7 +131,7 @@ public class MappersTest { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(vocs, true).processMdRecord(xml); + final List list = new OafToOafMapper(vocs, true, true).processMdRecord(xml); assertTrue(list.size() > 0); assertTrue(list.get(0) instanceof Publication); @@ -146,7 +146,7 @@ public class MappersTest { void testDataset() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); assertEquals(3, list.size()); assertTrue(list.get(0) instanceof Dataset); @@ -240,7 +240,7 @@ public class MappersTest { void testSoftware() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); assertEquals(1, list.size()); assertTrue(list.get(0) instanceof Software); @@ -259,7 +259,7 @@ public class MappersTest { void testDataset_2() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); @@ -269,7 +269,7 @@ public class MappersTest { @Test void testClaimDedup() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml")); - final List list = new OafToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); @@ -279,7 +279,7 @@ public class MappersTest { @Test void testNakala() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); @@ -303,7 +303,7 @@ public class MappersTest { @Test void testClaimFromCrossref() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); - final List list = new OafToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); @@ -319,7 +319,7 @@ public class MappersTest { @Test void testODFRecord() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); @@ -333,7 +333,7 @@ public class MappersTest { @Test void testTextGrid() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml")); - final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list));