From e4b105cece1a442958dd051ebf878fc3333b5e5a Mon Sep 17 00:00:00 2001 From: sandro Date: Mon, 20 Apr 2020 18:10:07 +0200 Subject: [PATCH] improved crossref mapping --- .../doiboost/crossref/Crossref2Oaf.scala | 156 +++++++++++++----- .../dhp/doiboost/orcid/oozie_app/workflow.xml | 2 +- .../eu/dnetlib/doiboost/DoiBoostTest.java | 97 +++++++---- .../resources/eu/dnetlib/doiboost/pc.json | 26 +++ 4 files changed, 206 insertions(+), 75 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 8053a7cbb..d3e334000 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -2,8 +2,10 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang.StringUtils import org.json4s import org.json4s.DefaultFormats +import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ import org.slf4j.Logger @@ -11,7 +13,7 @@ import scala.collection.JavaConverters._ class Crossref2Oaf { -//STATIC STRING + //STATIC STRING val MAG = "MAG" val ORCID = "ORCID" val CROSSREF = "Crossref" @@ -105,59 +107,123 @@ class Crossref2Oaf { // Add DataInfo result.setDataInfo(generateDataInfo()) - result.setLastupdatetimestamp((json \"indexed" \"timestamp").extract[Long]) - result.setDateofcollection((json \"indexed" \"date-time").extract[String]) + result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long]) + result.setDateofcollection((json \ "indexed" \ "date-time").extract[String]) - //result.setCollectedfrom() + result.setCollectedfrom(List(createCollectedFrom()).asJava) + // Publisher ( Name of work's publisher mapped into Result/Publisher) + val publisher = (json \ "publisher").extract[String] + result.setPublisher(asField(publisher)) + + // TITLE + val mainTitles = for {JString(title) <- json \ "title"} yield createSP(title, "main title", "dnet:dataCite_title") + val originalTitles = for {JString(title) <- json \ "original-title"} yield createSP(title, "alternative title", "dnet:dataCite_title") + val shortTitles = for {JString(title) <- json \ "short-title"} yield createSP(title, "alternative title", "dnet:dataCite_title") + result.setTitle((mainTitles ::: originalTitles ::: shortTitles).asJava) + + // DESCRIPTION + val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description) + result.setDescription(descriptionList.asJava) + // Source + val sourceList = for {JString(source) <- json \ "source"} yield asField(source) + + result.setSource(sourceList.asJava) + + + //RELEVANT DATE Mapping + val createdDate =generateDate((json \ "created" \"date-time").extract[String],(json \ "created"\"date-parts").extract[List[List[Int]]],"created", "dnet:dataCite_date" ) + val postedDate =generateDate((json \ "posted" \"date-time").extractOrElse[String](null),(json \ "posted"\"date-parts").extract[List[List[Int]]],"available", "dnet:dataCite_date" ) + val acceptedDate =generateDate((json \ "accepted" \"date-time").extractOrElse[String](null),(json \ "accepted"\"date-parts").extract[List[List[Int]]],"accepted", "dnet:dataCite_date" ) + val publishedPrintDate =generateDate((json \ "published-print" \"date-time").extractOrElse[String](null),(json \ "published-print"\"date-parts").extract[List[List[Int]]],"published-print", "dnet:dataCite_date" ) + val publishedOnlineDate =generateDate((json \ "published-online" \"date-time").extractOrElse[String](null),(json \ "published-online"\"date-parts").extract[List[List[Int]]],"published-online", "dnet:dataCite_date" ) + + result.setRelevantdate(List(createdDate ,postedDate, acceptedDate,publishedOnlineDate, publishedPrintDate).asJava) result } - def generateIdentifier(oaf: Result, doi:String): String = { - val id = DHPUtils.md5(doi.toLowerCase) - if (oaf.isInstanceOf[Dataset]) - return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}" - s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" - } - - def generateDataInfo(): DataInfo = { - val di =new DataInfo - di.setDeletedbyinference(false) - di.setInferred(false) - di.setInvisible(false) - di.setTrust("0.9") - di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) - di - } + def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = { + if (StringUtils.isNotBlank(dt)) + return createSP(dt, classId, schemeId) - def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId, schemeId)) - sp.setValue(value) - sp - - } - - def createQualifier(cls:String, sch:String):Qualifier = { - val q = new Qualifier - q.setClassid(cls) - q.setClassname(cls) - q.setSchemeid(sch) - q.setSchemename(sch) - q - } - - - def generateItemFromType(objectType: String, objectSubType: String): Result = { - if (mappingCrossrefType.contains(objectType)) { - if (mappingCrossrefType(objectType).equalsIgnoreCase("publication")) - return new Publication() - if (mappingCrossrefType(objectType).equalsIgnoreCase("dataset")) - return new Dataset() + if (datePart != null && datePart.size == 1) { + val res = datePart.head + if (res.size == 3) { + val dp = f"${res.head}-${res(1)}%02d-${res(2)}%02d" + println(dp) + if (dp.length == 10) { + return createSP(dp, classId, schemeId) + } + } } null } -} + + def generateIdentifier(oaf: Result, doi: String): String = { + val id = DHPUtils.md5(doi.toLowerCase) + if (oaf.isInstanceOf[Dataset]) + return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}" + s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" + } + + def asField[T](value: T): Field[T] = { + val tmp = new Field[T] + tmp.setValue(value) + tmp + + + } + + + def generateDataInfo(): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust("0.9") + di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) + di + } + + + def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId, schemeId)) + sp.setValue(value) + sp + + } + + def createCollectedFrom(): KeyValue = { + + val cf = new KeyValue + cf.setValue(CROSSREF) + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5("crossref")) + cf + + } + + def createQualifier(cls: String, sch: String): Qualifier = { + val q = new Qualifier + q.setClassid(cls) + q.setClassname(cls) + q.setSchemeid(sch) + q.setSchemename(sch) + q + } + + + def generateItemFromType(objectType: String, objectSubType: String): Result = { + if (mappingCrossrefType.contains(objectType)) { + if (mappingCrossrefType(objectType).equalsIgnoreCase("publication")) + return new Publication() + if (mappingCrossrefType(objectType).equalsIgnoreCase("dataset")) + return new Dataset() + } + null + } + + } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml index a52a56634..82096e4ca 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + workingPath diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java index 9041d4488..9f3326412 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java @@ -1,68 +1,107 @@ package eu.dnetlib.doiboost; +import static org.junit.jupiter.api.Assertions.*; + import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.doiboost.crossref.Crossref2Oaf; import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF; +import java.io.IOException; +import java.util.List; import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.platform.commons.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; - public class DoiBoostTest { Logger logger = LoggerFactory.getLogger(DoiBoostTest.class); - public void test() throws Exception { - //SparkDownloadContentFromCrossref.main(null); - //CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" ")); - SparkMapDumpIntoOAF.main("-m local[*] -s file:///data/doiboost/crossref_dump.seq".split(" ")); + // SparkDownloadContentFromCrossref.main(null); + // CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" + // ")); + SparkMapDumpIntoOAF.main( + "-m local[*] -s file:///data/doiboost/crossref_dump.seq".split(" ")); } - - @Test public void testConvertCrossRef2Oaf() throws IOException { final String json = IOUtils.toString(getClass().getResourceAsStream("pc.json")); ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); - Assertions.assertNotNull(json); - Assertions.assertFalse(StringUtils.isBlank(json)); - - - + assertNotNull(json); + assertFalse(StringUtils.isBlank(json)); Crossref2Oaf cf = new Crossref2Oaf(); final Result result = cf.convert(json, logger); - Assertions.assertNotNull(result); + assertNotNull(result); + + assertNotNull(result.getDataInfo(), "Datainfo test not null Failed"); + assertNotNull( + result.getDataInfo().getProvenanceaction(), + "DataInfo/Provenance test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassid()), + "DataInfo/Provenance/classId test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassname()), + "DataInfo/Provenance/className test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemeid()), + "DataInfo/Provenance/SchemeId test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemename()), + "DataInfo/Provenance/SchemeName test not null Failed"); + + assertNotNull(result.getCollectedfrom(), "CollectedFrom test not null Failed"); + assertTrue(result.getCollectedfrom().size() > 0); + assertTrue( + result.getCollectedfrom().stream() + .anyMatch( + c -> + c.getKey() + .equalsIgnoreCase( + "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"))); + assertTrue( + result.getCollectedfrom().stream() + .anyMatch(c -> c.getValue().equalsIgnoreCase("crossref"))); + + assertTrue( + result.getRelevantdate().stream() + .anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> d.getQualifier().getClassid().equalsIgnoreCase("available"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("accepted"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> + d.getQualifier() + .getClassid() + .equalsIgnoreCase("published-online"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> + d.getQualifier() + .getClassid() + .equalsIgnoreCase("published-print"))); logger.info(mapper.writeValueAsString(result)); - } - - - - @Test public void testPath() throws Exception { final String json = IOUtils.toString(getClass().getResourceAsStream("response.json")); - final List res = JsonPath.read(json, "$.hits.hits[*]._source.blob"); + final List res = JsonPath.read(json, "$.hits.hits[*]._source.blob"); System.out.println(res.size()); - } - - - - - } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/pc.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/pc.json index c35c97d53..13b2fea84 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/pc.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/pc.json @@ -112,9 +112,35 @@ ] ] }, + "published-print": { + "timestamp": 1446095513000, + "date-time": "2015-10-29T05:11:53Z", + "date-parts": [ + [ + 2015, + 2, + 29 + ] + ] + }, + "published-online": { + "date-parts": [ + [ + 2015, + 2, + 2 + ] + ] + }, "title": [ "Genetic transformation of micropropagated shoots ofPinus radiataD.Don" ], + "original-title": [ + "OR TITLE" + ], + "short-title": [ + "SHORT TITLE" + ], "group-title": "Plant Biology", "subtype": "preprint" } \ No newline at end of file