GenerateEntitiesApplication can be configured to hash the id value or not

This commit is contained in:
Claudio Atzori 2020-11-30 12:00:38 +01:00
parent 758d27745d
commit 2c407e775e
8 changed files with 82 additions and 49 deletions

View File

@ -8,7 +8,7 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
@ -31,17 +31,16 @@ public class IdentifierFactory implements Serializable {
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)"; "(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12; public static final int ID_PREFIX_LEN = 12;
public static final String NONE = "none";
/** /**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id * Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
* when no PID is available * when no PID is available
* @param entity the entity providing PIDs and a default ID. * @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported. * @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @param md5 indicates whether the PID value should be hashed or not.
* @return an identifier from the most relevant PID, entity.id otherwise * @return an identifier from the most relevant PID, entity.id otherwise
*/ */
public static <T extends OafEntity> String createIdentifier(T entity) { public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
return entity.getId(); return entity.getId();
} }
@ -69,15 +68,27 @@ public class IdentifierFactory implements Serializable {
.stream() .stream()
.sorted(new PidValueComparator()) .sorted(new PidValueComparator())
.findFirst() .findFirst()
.map(s -> idFromPid(entity, s)) .map(s -> idFromPid(entity, s, md5))
.orElseGet(entity::getId)) .orElseGet(entity::getId))
.orElseGet(entity::getId)) .orElseGet(entity::getId))
.orElseGet(entity::getId); .orElseGet(entity::getId);
} }
/**
* Convenience overload that builds the identifier with the PID value hashed: it delegates to
* {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} passing {@code true} as the md5 flag.
*
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type.
* @return the identifier returned by the two-argument variant invoked with md5 hashing enabled
* @see IdentifierFactory#createIdentifier(OafEntity, boolean)
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
return createIdentifier(entity, true);
}
protected static boolean pidFilter(StructuredProperty s) { protected static boolean pidFilter(StructuredProperty s) {
if (Objects.isNull(s.getQualifier()) || if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) { StringUtils.isBlank(s.getValue()) ||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) {
return false; return false;
} }
try { try {
@ -93,21 +104,14 @@ public class IdentifierFactory implements Serializable {
} }
} }
private static String verifyIdSyntax(String s) { private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { final String value = CleaningFunctions.normalizePidValue(s).getValue();
throw new RuntimeException(String.format("malformed id: '%s'", s));
} else {
return s;
}
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
return new StringBuilder() return new StringBuilder()
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR)) .append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
.append(ID_PREFIX_SEPARATOR) .append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid())) .append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR) .append(ID_SEPARATOR)
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue())) .append(md5 ? DHPUtils.md5(value) : value)
.toString(); .toString();
} }

View File

@ -22,24 +22,41 @@ public class IdentifierFactoryTest {
@Test @Test
public void testCreateIdentifierForPublication() throws IOException { public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); verifyIdentifier(
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013")); "publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"), true);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329")); verifyIdentifier(
"publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"), true);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"), true);
verifyIdentifier( verifyIdentifier(
"publication_urn1.json", "publication_urn1.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2")); "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"), true);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID); verifyIdentifier("publication_3.json", defaultID, true);
verifyIdentifier("publication_4.json", defaultID); verifyIdentifier("publication_4.json", defaultID, true);
verifyIdentifier("publication_5.json", defaultID); verifyIdentifier("publication_5.json", defaultID, true);
} }
protected void verifyIdentifier(String filename, String expectedID) throws IOException { @Test
public void testCreateIdentifierForPublicationNoHash() throws IOException {
// With md5 = false the expected identifier carries the un-hashed PID value after the
// "<type prefix>::" part, instead of its md5 digest.
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2011.03.013", false);
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::21459329", false);
verifyIdentifier(
"publication_urn1.json", "50|urn_________::urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2", false);

// These fixtures are expected to keep the default ID also when hashing is disabled
// (presumably because they provide no usable PID -- TODO confirm against the JSON resources).
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, false);
verifyIdentifier("publication_4.json", defaultID, false);
verifyIdentifier("publication_5.json", defaultID, false);
}
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class); final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
String id = IdentifierFactory.createIdentifier(pub); String id = IdentifierFactory.createIdentifier(pub, md5);
assertNotNull(id); assertNotNull(id);
assertEquals(expectedID, id); assertEquals(expectedID, id);

View File

@ -26,6 +26,8 @@ public abstract class AbstractMdRecordToOafMapper {
private final boolean invisible; private final boolean invisible;
private final boolean shouldHashId;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@ -50,9 +52,11 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) { protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
final boolean shouldHashId) {
this.vocs = vocs; this.vocs = vocs;
this.invisible = invisible; this.invisible = invisible;
this.shouldHashId = shouldHashId;
} }
public List<Oaf> processMdRecord(final String xml) { public List<Oaf> processMdRecord(final String xml) {
@ -137,7 +141,7 @@ public abstract class AbstractMdRecordToOafMapper {
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
final String id = IdentifierFactory.createIdentifier(entity); final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
if (!id.equals(entity.getId())) { if (!id.equals(entity.getId())) {
entity.getOriginalId().add(entity.getId()); entity.getOriginalId().add(entity.getId());
entity.setId(id); entity.setId(id);

View File

@ -64,13 +64,19 @@ public class GenerateEntitiesApplication {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final boolean shouldHashId = Optional
.ofNullable(parser.get("shouldHashId"))
.map(Boolean::valueOf)
.orElse(true);
log.info("shouldHashId: {}", shouldHashId);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService); final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
generateEntities(spark, vocs, sourcePaths, targetPath); generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId);
}); });
} }
@ -78,7 +84,8 @@ public class GenerateEntitiesApplication {
final SparkSession spark, final SparkSession spark,
final VocabularyGroup vocs, final VocabularyGroup vocs,
final String sourcePaths, final String sourcePaths,
final String targetPath) { final String targetPath,
final boolean shouldHashId) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final List<String> existingSourcePaths = Arrays final List<String> existingSourcePaths = Arrays
@ -97,7 +104,7 @@ public class GenerateEntitiesApplication {
sc sc
.sequenceFile(sp, Text.class, Text.class) .sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), vocs)) .map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.flatMap(list -> list.iterator())); .flatMap(list -> list.iterator()));
} }
@ -113,20 +120,21 @@ public class GenerateEntitiesApplication {
private static List<Oaf> convertToListOaf( private static List<Oaf> convertToListOaf(
final String id, final String id,
final String s, final String s,
final boolean shouldHashId,
final VocabularyGroup vocs) { final VocabularyGroup vocs) {
final String type = StringUtils.substringAfter(id, ":"); final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "oaf-store-cleaned": case "oaf-store-cleaned":
case "oaf-store-claim": case "oaf-store-claim":
return new OafToOafMapper(vocs, false).processMdRecord(s); return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "odf-store-cleaned": case "odf-store-cleaned":
case "odf-store-claim": case "odf-store-claim":
return new OdfToOafMapper(vocs, false).processMdRecord(s); return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "oaf-store-intersection": case "oaf-store-intersection":
return new OafToOafMapper(vocs, true).processMdRecord(s); return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "odf-store-intersection": case "odf-store-intersection":
return new OdfToOafMapper(vocs, true).processMdRecord(s); return new OdfToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "datasource": case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class)); return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization": case "organization":

View File

@ -22,8 +22,8 @@ import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) { public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible); super(vocs, invisible, shouldHashId);
} }
@Override @Override

View File

@ -19,8 +19,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) { public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
super(vocs, invisible); super(vocs, invisible, shouldHashId);
} }
@Override @Override

View File

@ -77,7 +77,7 @@ public class GenerateEntitiesApplicationTest {
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException { protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName)); final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
return new OdfToOafMapper(vocs, false) return new OdfToOafMapper(vocs, false, true)
.processMdRecord(xml) .processMdRecord(xml)
.stream() .stream()
.filter(s -> clazz.isAssignableFrom(s.getClass())) .filter(s -> clazz.isAssignableFrom(s.getClass()))

View File

@ -52,7 +52,7 @@ public class MappersTest {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Publication); assertTrue(list.get(0) instanceof Publication);
@ -131,7 +131,7 @@ public class MappersTest {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, true).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, true, true).processMdRecord(xml);
assertTrue(list.size() > 0); assertTrue(list.size() > 0);
assertTrue(list.get(0) instanceof Publication); assertTrue(list.get(0) instanceof Publication);
@ -146,7 +146,7 @@ public class MappersTest {
void testDataset() throws IOException { void testDataset() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Dataset); assertTrue(list.get(0) instanceof Dataset);
@ -240,7 +240,7 @@ public class MappersTest {
void testSoftware() throws IOException { void testSoftware() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(1, list.size()); assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Software); assertTrue(list.get(0) instanceof Software);
@ -259,7 +259,7 @@ public class MappersTest {
void testDataset_2() throws IOException { void testDataset_2() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));
@ -269,7 +269,7 @@ public class MappersTest {
@Test @Test
void testClaimDedup() throws IOException { void testClaimDedup() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));
@ -279,7 +279,7 @@ public class MappersTest {
@Test @Test
void testNakala() throws IOException { void testNakala() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));
@ -303,7 +303,7 @@ public class MappersTest {
@Test @Test
void testClaimFromCrossref() throws IOException { void testClaimFromCrossref() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));
@ -319,7 +319,7 @@ public class MappersTest {
@Test @Test
void testODFRecord() throws IOException { void testODFRecord() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************"); System.out.println("***************");
@ -333,7 +333,7 @@ public class MappersTest {
@Test @Test
void testTextGrid() throws IOException { void testTextGrid() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println(new ObjectMapper().writeValueAsString(list));