forked from antonis.lempesis/dnet-hadoop
GenerateEntitiesApplication can be configured to hash the id value or not
This commit is contained in:
parent
758d27745d
commit
2c407e775e
|
@ -8,7 +8,7 @@ import java.util.Objects;
|
|||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
|
@ -31,17 +31,16 @@ public class IdentifierFactory implements Serializable {
|
|||
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
|
||||
|
||||
public static final int ID_PREFIX_LEN = 12;
|
||||
public static final String NONE = "none";
|
||||
|
||||
/**
|
||||
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
|
||||
* when no PID is available
|
||||
* @param entity the entity providing PIDs and a default ID.
|
||||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
||||
* @param md5 indicates whether should hash the PID value or not.
|
||||
* @return an identifier from the most relevant PID, entity.id otherwise
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
|
||||
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
||||
return entity.getId();
|
||||
}
|
||||
|
@ -69,15 +68,27 @@ public class IdentifierFactory implements Serializable {
|
|||
.stream()
|
||||
.sorted(new PidValueComparator())
|
||||
.findFirst()
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.map(s -> idFromPid(entity, s, md5))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
return createIdentifier(entity, true);
|
||||
}
|
||||
|
||||
protected static boolean pidFilter(StructuredProperty s) {
|
||||
if (Objects.isNull(s.getQualifier()) ||
|
||||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
|
||||
StringUtils.isBlank(s.getValue()) ||
|
||||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||
return false;
|
||||
}
|
||||
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) {
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
|
@ -93,21 +104,14 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
private static String verifyIdSyntax(String s) {
|
||||
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||
} else {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
||||
final String value = CleaningFunctions.normalizePidValue(s).getValue();
|
||||
return new StringBuilder()
|
||||
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||
.append(ID_PREFIX_SEPARATOR)
|
||||
.append(createPrefix(s.getQualifier().getClassid()))
|
||||
.append(ID_SEPARATOR)
|
||||
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
|
||||
.append(md5 ? DHPUtils.md5(value) : value)
|
||||
.toString();
|
||||
}
|
||||
|
||||
|
|
|
@ -22,24 +22,41 @@ public class IdentifierFactoryTest {
|
|||
@Test
|
||||
public void testCreateIdentifierForPublication() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
|
||||
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||
verifyIdentifier(
|
||||
"publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"), true);
|
||||
verifyIdentifier(
|
||||
"publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"), true);
|
||||
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"), true);
|
||||
verifyIdentifier(
|
||||
"publication_urn1.json",
|
||||
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"), true);
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID);
|
||||
verifyIdentifier("publication_4.json", defaultID);
|
||||
verifyIdentifier("publication_5.json", defaultID);
|
||||
verifyIdentifier("publication_3.json", defaultID, true);
|
||||
verifyIdentifier("publication_4.json", defaultID, true);
|
||||
verifyIdentifier("publication_5.json", defaultID, true);
|
||||
}
|
||||
|
||||
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
||||
@Test
|
||||
public void testCreateIdentifierForPublicationNoHash() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2011.03.013", false);
|
||||
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
|
||||
verifyIdentifier("publication_pmc1.json", "50|pmc_________::21459329", false);
|
||||
verifyIdentifier(
|
||||
"publication_urn1.json", "50|urn_________::urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2", false);
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID, false);
|
||||
verifyIdentifier("publication_4.json", defaultID, false);
|
||||
verifyIdentifier("publication_5.json", defaultID, false);
|
||||
}
|
||||
|
||||
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||
|
||||
String id = IdentifierFactory.createIdentifier(pub);
|
||||
String id = IdentifierFactory.createIdentifier(pub, md5);
|
||||
|
||||
assertNotNull(id);
|
||||
assertEquals(expectedID, id);
|
||||
|
|
|
@ -26,6 +26,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
private final boolean invisible;
|
||||
|
||||
private final boolean shouldHashId;
|
||||
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
|
@ -50,9 +52,11 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
|
||||
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
|
||||
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible,
|
||||
final boolean shouldHashId) {
|
||||
this.vocs = vocs;
|
||||
this.invisible = invisible;
|
||||
this.shouldHashId = shouldHashId;
|
||||
}
|
||||
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
|
@ -137,7 +141,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
final long lastUpdateTimestamp) {
|
||||
|
||||
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||
final String id = IdentifierFactory.createIdentifier(entity);
|
||||
final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
|
||||
if (!id.equals(entity.getId())) {
|
||||
entity.getOriginalId().add(entity.getId());
|
||||
entity.setId(id);
|
||||
|
|
|
@ -64,13 +64,19 @@ public class GenerateEntitiesApplication {
|
|||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final boolean shouldHashId = Optional
|
||||
.ofNullable(parser.get("shouldHashId"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(true);
|
||||
log.info("shouldHashId: {}", shouldHashId);
|
||||
|
||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
|
||||
generateEntities(spark, vocs, sourcePaths, targetPath);
|
||||
generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -78,7 +84,8 @@ public class GenerateEntitiesApplication {
|
|||
final SparkSession spark,
|
||||
final VocabularyGroup vocs,
|
||||
final String sourcePaths,
|
||||
final String targetPath) {
|
||||
final String targetPath,
|
||||
final boolean shouldHashId) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
final List<String> existingSourcePaths = Arrays
|
||||
|
@ -97,7 +104,7 @@ public class GenerateEntitiesApplication {
|
|||
sc
|
||||
.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), vocs))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(list -> list.iterator()));
|
||||
}
|
||||
|
@ -113,20 +120,21 @@ public class GenerateEntitiesApplication {
|
|||
private static List<Oaf> convertToListOaf(
|
||||
final String id,
|
||||
final String s,
|
||||
final boolean shouldHashId,
|
||||
final VocabularyGroup vocs) {
|
||||
final String type = StringUtils.substringAfter(id, ":");
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "oaf-store-cleaned":
|
||||
case "oaf-store-claim":
|
||||
return new OafToOafMapper(vocs, false).processMdRecord(s);
|
||||
return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||
case "odf-store-cleaned":
|
||||
case "odf-store-claim":
|
||||
return new OdfToOafMapper(vocs, false).processMdRecord(s);
|
||||
return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
|
||||
case "oaf-store-intersection":
|
||||
return new OafToOafMapper(vocs, true).processMdRecord(s);
|
||||
return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
|
||||
case "odf-store-intersection":
|
||||
return new OdfToOafMapper(vocs, true).processMdRecord(s);
|
||||
return new OdfToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
|
||||
case "datasource":
|
||||
return Arrays.asList(convertFromJson(s, Datasource.class));
|
||||
case "organization":
|
||||
|
|
|
@ -22,8 +22,8 @@ import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
|||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
super(vocs, invisible);
|
||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||
super(vocs, invisible, shouldHashId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,8 +19,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
||||
|
||||
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
super(vocs, invisible);
|
||||
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) {
|
||||
super(vocs, invisible, shouldHashId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -77,7 +77,7 @@ public class GenerateEntitiesApplicationTest {
|
|||
|
||||
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
|
||||
return new OdfToOafMapper(vocs, false)
|
||||
return new OdfToOafMapper(vocs, false, true)
|
||||
.processMdRecord(xml)
|
||||
.stream()
|
||||
.filter(s -> clazz.isAssignableFrom(s.getClass()))
|
||||
|
|
|
@ -52,7 +52,7 @@ public class MappersTest {
|
|||
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
||||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
@ -131,7 +131,7 @@ public class MappersTest {
|
|||
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
||||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, true).processMdRecord(xml);
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, true, true).processMdRecord(xml);
|
||||
|
||||
assertTrue(list.size() > 0);
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
@ -146,7 +146,7 @@ public class MappersTest {
|
|||
void testDataset() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.get(0) instanceof Dataset);
|
||||
|
@ -240,7 +240,7 @@ public class MappersTest {
|
|||
void testSoftware() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(1, list.size());
|
||||
assertTrue(list.get(0) instanceof Software);
|
||||
|
@ -259,7 +259,7 @@ public class MappersTest {
|
|||
void testDataset_2() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
@ -269,7 +269,7 @@ public class MappersTest {
|
|||
@Test
|
||||
void testClaimDedup() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml"));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
@ -279,7 +279,7 @@ public class MappersTest {
|
|||
@Test
|
||||
void testNakala() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml"));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
@ -303,7 +303,7 @@ public class MappersTest {
|
|||
@Test
|
||||
void testClaimFromCrossref() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
@ -319,7 +319,7 @@ public class MappersTest {
|
|||
@Test
|
||||
void testODFRecord() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
System.out.println("***************");
|
||||
|
@ -333,7 +333,7 @@ public class MappersTest {
|
|||
@Test
|
||||
void testTextGrid() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
|
|
Loading…
Reference in New Issue