diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTest.java new file mode 100644 index 000000000..6529d43da --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTest.java @@ -0,0 +1,84 @@ +package eu.dnetlib.dhp.common.vocabulary; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import static org.mockito.Mockito.lenient; + + +@ExtendWith(MockitoExtension.class) +public class VocabularyTest { + + + @Mock + protected ISLookUpService isLookUpService; + + protected VocabularyGroup vocabularies; + + @BeforeEach + public void setUpVocabulary() throws ISLookUpException, IOException { + + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); + } + + private static List vocs() throws IOException { + return IOUtils + .readLines( + Objects + .requireNonNull( + VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"))); + } + + private static List synonyms() throws IOException { + return IOUtils + .readLines( + Objects + .requireNonNull( + 
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt"))); + } + + + @Test + void testVocabularyMatch () throws Exception{ + final String s= IOUtils.toString(this.getClass().getResourceAsStream("terms")); + + for (String s1 : s.split("\n")) { + + final Qualifier t1 = vocabularies.getSynonymAsQualifier("dnet:publication_resource", s1); + + if (t1 == null) { + System.err.println(s1+ " Missing"); + } + else { + System.out.println("syn=" + s1 + " term = " + t1.getClassid()); + + + System.out.println(vocabularies.getSynonymAsQualifier("dnet:result_typologies", t1.getClassid()).getClassname()); + } + } + + + + + + } +} diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/common/vocabulary/terms b/dhp-common/src/test/resources/eu/dnetlib/dhp/common/vocabulary/terms new file mode 100644 index 000000000..abeed4cc8 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/common/vocabulary/terms @@ -0,0 +1,34 @@ +grant +book +report-series +report-component +book-series +peer-review +component +report +book-track +database +standard +journal-volume +proceedings-series +preprint +book-section +letter +reference-book +edited-book +journal-issue +dataset +reference-entry +dissertation +book-chapter +book-part +journal +book-set +working_paper +dissertation +other +proceedings-article +journal-article +other +proceedings +monograph \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/CrossrefUtility.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/CrossrefUtility.scala new file mode 100644 index 000000000..28ea64c9b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/CrossrefUtility.scala @@ -0,0 +1,357 @@ +package eu.dnetlib.dhp.crossref + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._ +import 
eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
+import eu.dnetlib.dhp.schema.oaf._
+import org.apache.commons.lang.StringUtils
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JNothing, JObject, JString, JValue}
+import org.json4s.jackson.JsonMethods.parse
+
+import scala.collection.JavaConverters._
+
+/** Raw Crossref record: its DOI, the JSON payload and a timestamp. */
+case class CrossrefDT(doi: String, json: String, timestamp: Long) {}
+
+/** Maps Crossref JSON records onto OAF entities.
+  *
+  * NOTE(review): mappingFunder, mappingAuthor, mappingFunderToRelations, convertPublication,
+  * convertDataset, createSubject, generateAuhtor, generateIdentifier, decideAccessRight and
+  * asField are used below but neither defined nor explicitly imported here - confirm their origin.
+  */
+object CrossrefUtility {
+
+  // A DOI prefix at the very start, or right after a slash as in a DOI URL; both dots are literal.
+  val DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"
+  val DOI_PREFIX = "10."
+  val CROSSREF_COLLECTED_FROM = keyValue(ModelConstants.CROSSREF_ID, ModelConstants.CROSSREF_NAME)
+
+  /** Normalizes a DOI: removes whitespace, lowercases and drops whatever precedes the first "10.".
+    *
+    * @param input the raw DOI or DOI URL, may be null
+    * @return the normalized DOI, or null when input is null or holds no "10." prefix
+    */
+  def normalizeDoi(input: String): String = {
+    if (input == null)
+      return null
+    val replaced = input
+      .replaceAll("(?:\\n|\\r|\\t|\\s)", "")
+      .toLowerCase
+      .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
+    val start = replaced.indexOf(DOI_PREFIX)
+    if (start < 0) null else replaced.substring(start)
+  }
+
+  /** Builds a cleaned date from a date-time string or, failing that, from Crossref date parts.
+    *
+    * @param dt       date-time string; used whenever it is not blank
+    * @param datePart date parts; used only when it holds exactly one entry of 1 to 3 numbers
+    * @return the output of GraphCleaningFunctions.cleanDate, or null when no date can be built
+    */
+  def extractDate(dt: String, datePart: List[List[Int]]): String = {
+    if (StringUtils.isNotBlank(dt))
+      return GraphCleaningFunctions.cleanDate(dt)
+    if (datePart == null || datePart.size != 1)
+      return null
+    datePart.head match {
+      case List(year, month, day) =>
+        val dp = f"$year-$month%02d-$day%02d"
+        // anything that does not render as a 10-character date is discarded
+        if (dp.length == 10) GraphCleaningFunctions.cleanDate(dp) else null
+      case List(year, month) => GraphCleaningFunctions.cleanDate(f"$year-$month%02d-01")
+      case List(year)        => GraphCleaningFunctions.cleanDate(s"$year-01-01")
+      case _                 => null
+    }
+  }
+
+  /** Wraps the date obtained by extractDate in a StructuredProperty qualified by classId and
+    * schemeId; returns null when no date is available.
+    */
+  private def generateDate(
+    dt: String,
+    datePart: List[List[Int]],
+    classId: String,
+    schemeId: String
+  ): StructuredProperty = {
+    val dp = extractDate(dt, datePart)
+    if (StringUtils.isNotBlank(dp)) structuredProperty(dp, classId, classId, schemeId) else null
+  }
+
+  /** Creates the empty Result subclass that matches a Crossref work type.
+    *
+    * @return the new entity together with its instance type encoded as "classid classname"
+    *         (the form mappingResult slices), or null when the type cannot be resolved
+    */
+  private def generateItemFromType(objectType: String, vocabularies: VocabularyGroup): (Result, String) = {
+    val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType)
+    if (term == null)
+      return null
+    val typology = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid)
+    if (typology == null)
+      return null
+    val category = s"${term.getClassid} ${term.getClassname}"
+    typology.getClassname match {
+      case "publication"          => (new Publication, category)
+      case "dataset"              => (new Dataset, category)
+      case "software"             => (new Software, category)
+      case "otherresearchproduct" => (new OtherResearchProduct, category)
+      // an unexpected typology is handled like an unknown type instead of raising a MatchError
+      case _ => null
+    }
+  }
+
+  /** Converts one Crossref JSON record into OAF objects.
+    *
+    * @param input        the Crossref record as a JSON string
+    * @param vocabularies vocabularies used to resolve the Crossref "type"
+    * @return funder relations (if any) followed by the mapped result, or an empty list when the
+    *         type is missing or unknown, or when no identifier could be assigned
+    */
+  def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(input)
+
+    var resultList: List[Oaf] = List()
+
+    val objectType = (json \ "type").extractOrElse[String](null)
+    if (objectType == null)
+      return resultList
+
+    val resultWithType = generateItemFromType(objectType, vocabularies)
+    if (resultWithType == null)
+      return List()
+
+    val (result, cOBJCategory) = resultWithType
+    // mappingResult fills result in place and returns null when it could not assign an identifier
+    if (mappingResult(result, json, cOBJCategory) == null || result.getId == null)
+      return List()
+
+    val funderList: List[mappingFunder] =
+      (json \ "funder").extractOrElse[List[mappingFunder]](List())
+
+    if (funderList.nonEmpty) {
+      resultList = resultList ::: mappingFunderToRelations(
+        funderList,
+        result.getId,
+        CROSSREF_COLLECTED_FROM,
+        result.getDataInfo,
+        result.getLastupdatetimestamp
+      )
+    }
+
+    result match {
+      case publication: Publication => convertPublication(publication, json, cOBJCategory)
+      case dataset: Dataset         => convertDataset(dataset)
+      // no type-specific step exists for Software and OtherResearchProduct; without this case
+      // the match would throw a MatchError for them
+      case _ =>
+    }
+
+    resultList ::: List(result)
+  }
+
+  /** Fills a Result with the fields common to every Crossref record.
+    *
+    * @param result       entity to populate; it is modified in place
+    * @param json         parsed Crossref record
+    * @param cobjCategory instance type encoded as "classid classname"
+    * @return result, or null when the record has no usable DOI or no identifier could be built
+    */
+  def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+
+    //MAPPING Crossref DOI into PID
+    val doi: String = normalizeDoi((json \ "DOI").extractOrElse[String](null))
+    // without a DOI neither the pid, the url nor the identifier below can be built
+    if (doi == null)
+      return null
+
+    result.setPid(
+      List(
+        structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES)
+      ).asJava
+    )
+
+    //MAPPING Crossref DOI into OriginalId
+    //and Other Original Identifier of dataset like clinical-trial-number
+    val clinicalTrialNumbers: List[String] = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
+    val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids
+    val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
+
+    // copied into an ArrayList because the old identifier may be appended further down and the
+    // asJava view of an immutable List does not support add()
+    result.setOriginalId(new java.util.ArrayList[String](tmp.filter(id => id != null).asJava))
+
+    // Add DataInfo
+    result.setDataInfo(dataInfo(false, false, 0.9F, null, false, ModelConstants.REPOSITORY_PROVENANCE_ACTIONS))
+
+    result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
+    result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
+
+    result.setCollectedfrom(List(CROSSREF_COLLECTED_FROM).asJava)
+
+    // Publisher ( Name of work's publisher mapped into Result/Publisher)
+    val publisher = (json \ "publisher").extractOrElse[String](null)
+    if (publisher != null && publisher.nonEmpty)
+      result.setPublisher(new Publisher(publisher))
+
+    // TITLE
+    val mainTitles =
+      for { JString(title) <- json \ "title" if title.nonEmpty }
+        yield structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER)
+    val originalTitles = for {
+      JString(title) <- json \ "original-title" if title.nonEmpty
+    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
+    val shortTitles = for {
+      JString(title) <- json \ "short-title" if title.nonEmpty
+    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
+    val subtitles =
+      for { JString(title) <- json \ "subtitle" if title.nonEmpty }
+        yield structuredProperty(title, ModelConstants.SUBTITLE_QUALIFIER)
+    result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
+
+    // DESCRIPTION
+    val descriptionList =
+      for { JString(description) <- json \ "abstract" } yield description
+    result.setDescription(descriptionList.asJava)
+
+    // Source
+    val sourceList = for {
+      JString(source) <- json \ "source" if source != null && source.nonEmpty
+    } yield source
+    result.setSource(sourceList.asJava)
+
+    //RELEVANT DATE Mapping
+    val createdDate = generateDate(
+      (json \ "created" \ "date-time").extractOrElse[String](null),
+      (json \ "created" \ "date-parts").extract[List[List[Int]]],
+      "created",
+      ModelConstants.DNET_DATACITE_DATE
+    )
+    val postedDate = generateDate(
+      (json \ "posted" \ "date-time").extractOrElse[String](null),
+      (json \ "posted" \ "date-parts").extract[List[List[Int]]],
+      "available",
+      ModelConstants.DNET_DATACITE_DATE
+    )
+    val acceptedDate = generateDate(
+      (json \ "accepted" \ "date-time").extractOrElse[String](null),
+      (json \ "accepted" \ "date-parts").extract[List[List[Int]]],
+      "accepted",
+      ModelConstants.DNET_DATACITE_DATE
+    )
+    val publishedPrintDate = generateDate(
+      (json \ "published-print" \ "date-time").extractOrElse[String](null),
+      (json \ "published-print" \ "date-parts").extract[List[List[Int]]],
+      "published-print",
+      ModelConstants.DNET_DATACITE_DATE
+    )
+    val publishedOnlineDate = generateDate(
+      (json \ "published-online" \ "date-time").extractOrElse[String](null),
+      (json \ "published-online" \ "date-parts").extract[List[List[Int]]],
+      "published-online",
+      ModelConstants.DNET_DATACITE_DATE
+    )
+
+    val issuedDate = extractDate(
+      (json \ "issued" \ "date-time").extractOrElse[String](null),
+      (json \ "issued" \ "date-parts").extract[List[List[Int]]]
+    )
+    // the issued date wins; the creation date is the fallback and may itself be missing
+    val acceptanceDate: String =
+      if (StringUtils.isNotBlank(issuedDate)) issuedDate
+      else if (createdDate != null) createdDate.getValue
+      else null
+    if (acceptanceDate != null)
+      result.setDateofacceptance(acceptanceDate)
+    result.setRelevantdate(
+      List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
+        .filter(p => p != null)
+        .asJava
+    )
+
+    //Mapping Subject
+    val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
+    if (subjectList.nonEmpty) {
+      result.setSubject(
+        subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
+      )
+    }
+
+    //Mapping Author
+    val authorList: List[mappingAuthor] =
+      (json \ "author").extractOrElse[List[mappingAuthor]](List())
+
+    // stable sort that moves the authors whose sequence is "first" to the front; the former
+    // sortWith predicate ignored its second argument and therefore was not a valid ordering
+    val sorted_list = authorList.sortBy(a => !a.sequence.exists(_.equalsIgnoreCase("first")))
+
+    result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
+      generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
+    }.asJava)
+
+    // Mapping instance
+    val instance = new Instance()
+    val license = for {
+      JObject(license) <- json \ "license"
+      JField("URL", JString(lic)) <- license
+      JField("content-version", JString(content_version)) <- license
+    } yield (asField(lic), content_version)
+    val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
+    // the license of the version of record ("vor") is preferred, the last one if there are several;
+    // otherwise the first license found is used
+    l.filter(d => d._2.equals("vor")).lastOption.orElse(l.headOption).foreach(d => instance.setLicense(d._1))
+
+    // Ticket #6281 added pid to Instance
+    instance.setPid(result.getPid)
+
+    val has_review = json \ "relation" \ "has-review" \ "id"
+
+    if (has_review != JNothing) {
+      instance.setRefereed(
+        OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS)
+      )
+    }
+
+    instance.setAccessright(decideAccessRight(instance.getLicense, acceptanceDate))
+    // instance and result get separate, equal qualifier objects, as before
+    instance.setInstancetype(instanceTypeQualifier(cobjCategory))
+    result.setResourcetype(instanceTypeQualifier(cobjCategory))
+
+    instance.setCollectedfrom(CROSSREF_COLLECTED_FROM)
+    if (acceptanceDate != null)
+      instance.setDateofacceptance(asField(acceptanceDate))
+    instance.setUrl(List("https://doi.org/" + doi).asJava)
+
+    result.setInstance(List(instance).asJava)
+
+    //IMPORTANT
+    //The old method result.setId(generateIdentifier(result, doi))
+    //is replaced using IdentifierFactory, but the old identifier
+    //is preserved among the originalId(s)
+    val oldId = generateIdentifier(result, doi)
+    result.setId(oldId)
+
+    val newId = IdentifierFactory.createDOIBoostIdentifier(result)
+    if (!oldId.equalsIgnoreCase(newId)) {
+      result.getOriginalId.add(oldId)
+    }
+    result.setId(newId)
+
+    if (result.getId == null) null else result
+  }
+
+  /** Builds the DNET_PUBLICATION_RESOURCE qualifier for a category written as "classid classname".
+    * The string is split at its first space; a category without a space is used as both id and name.
+    */
+  private def instanceTypeQualifier(cobjCategory: String): Qualifier = {
+    val sep = cobjCategory.indexOf(' ')
+    val (classId, className) =
+      if (sep < 0) (cobjCategory, cobjCategory)
+      else (cobjCategory.substring(0, sep), cobjCategory.substring(sep + 1))
+    OafMapperUtils.qualifier(classId, className, ModelConstants.DNET_PUBLICATION_RESOURCE)
+  }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/GenerateCrossrefDataset.scala
new file mode 100644
index 000000000..fd4bcd37d
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/GenerateCrossrefDataset.scala
@@ -0,0 +1,22 @@
+package eu.dnetlib.dhp.crossref
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import org.slf4j.{Logger, LoggerFactory}
+
+class GenerateCrossrefDataset (propertyPath: String, args: Array[String], log: Logger)
+  extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+  /** Here all the spark applications runs this method
+    * where the whole logic of the spark node is defined
+    */
+  override def run(): 
Unit = ??? /* NOTE(review): ??? throws scala.NotImplementedError, so main() below fails until run() is implemented */ +} + +/** Launcher for GenerateCrossrefDataset: holds the logger and the classpath location of its parameter file. */ +object GenerateCrossrefDataset{ + val log:Logger = LoggerFactory.getLogger(getClass) + val propertyPath ="/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json" + /** Builds the application from propertyPath, the given arguments and the logger, then calls initialize() and run(). */ + def main(args: Array[String]): Unit = { + new GenerateCrossrefDataset(propertyPath,args, log).initialize().run() + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 4789093cd..98f0962f3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -138,12 +138,11 @@ object DoiBoostMappingUtil { result } - def decideAccessRight(lic: Field[String], date: String): AccessRight = { - if (lic == null) { + def decideAccessRight(license: String, date: String): AccessRight = { + if (license == null || license.isEmpty) { //Default value Unknown return getUnknownQualifier() } - val license: String = lic.getValue //CC licenses if ( license.startsWith("cc") || @@ -305,7 +304,7 @@ object DoiBoostMappingUtil { } def generateDataInfo(): DataInfo = { - generateDataInfo("0.9") + generateDataInfo(0.9F) } def filterPublication(publication: Publication): Boolean = { @@ -330,7 +329,7 @@ object DoiBoostMappingUtil { // fixes #4360 (test publisher) val publisher = - if (publication.getPublisher != null) publication.getPublisher.getValue else null + if (publication.getPublisher != null) publication.getPublisher.getName else null if ( publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher @@ -358,7 +357,7 @@ object DoiBoostMappingUtil { // fixes #4368 if ( authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase( - publication.getPublisher.getValue + publication.getPublisher.getName ) ) return false @@ -374,8 +373,8 @@ object DoiBoostMappingUtil { 
true } - def generateDataInfo(trust: String): DataInfo = { - val di = new DataInfo + def generateDataInfo(trust: Float): DataInfo = { + val di = new EntityDataInfo di.setDeletedbyinference(false) di.setInferred(false) di.setInvisible(false) @@ -384,8 +383,8 @@ object DoiBoostMappingUtil { OafMapperUtils.qualifier( ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.SYSIMPORT_ACTIONSET, - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS + ) ) di @@ -393,7 +392,7 @@ object DoiBoostMappingUtil { def createSubject(value: String, classId: String, schemeId: String): Subject = { val s = new Subject - s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) + s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId)) s.setValue(value) s @@ -403,67 +402,37 @@ object DoiBoostMappingUtil { value: String, classId: String, className: String, - schemeId: String, - schemeName: String + schemeId: String + ): Subject = { val s = new Subject - s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) + s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId)) s.setValue(value) s } - def createSP( - value: String, - classId: String, - className: String, - schemeId: String, - schemeName: String - ): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) - sp.setValue(value) - sp - - } def createSP( value: String, classId: String, className: String, - schemeId: String, - schemeName: String, - dataInfo: DataInfo + schemeId: String ): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) + sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId)) sp.setValue(value) - sp.setDataInfo(dataInfo) sp } def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { val sp = new 
StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) + sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId)) sp.setValue(value) sp - } - def createSP( - value: String, - classId: String, - schemeId: String, - dataInfo: DataInfo - ): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) - sp.setValue(value) - sp.setDataInfo(dataInfo) - sp - - } def createCrossrefCollectedFrom(): KeyValue = { @@ -506,13 +475,6 @@ object DoiBoostMappingUtil { } - def asField[T](value: T): Field[T] = { - val tmp = new Field[T] - tmp.setValue(value) - tmp - - } - def isEmpty(x: String) = x == null || x.trim.isEmpty def normalizeDoi(input: String): String = {