WIP: Graph footprint optimisation #287

Draft
claudio.atzori wants to merge 39 commits from ticket_8369 into beta
5 changed files with 512 additions and 53 deletions
Showing only changes of commit 508648e1d8

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.common.vocabulary;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class)
public class VocabularyTest {
@Mock
protected ISLookUpService isLookUpService;
protected VocabularyGroup vocabularies;
@BeforeEach
public void setUpVocabulary() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
private static List<String> vocs() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")));
}
private static List<String> synonyms() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
VocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")));
}
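// Iterates over the Crossref type terms and checks whether each one resolves through
// dnet:publication_resource and then dnet:result_typologies; terms that do not resolve are reported as "Missing".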
@Test
void testVocabularyMatch() throws Exception {
final String s = IOUtils.toString(this.getClass().getResourceAsStream("terms"));
for (String s1 : s.split("\n")) {
final Qualifier t1 = vocabularies.getSynonymAsQualifier("dnet:publication_resource", s1);
if (t1 == null) {
System.err.println(s1 + " Missing");
} else {
System.out.println("syn=" + s1 + " term = " + t1.getClassid());
System.out.println(vocabularies.getSynonymAsQualifier("dnet:result_typologies", t1.getClassid()).getClassname());
}
}
}
}

View File

@ -0,0 +1,34 @@
grant
book
report-series
report-component
book-series
peer-review
component
report
book-track
database
standard
journal-volume
proceedings-series
preprint
book-section
letter
reference-book
edited-book
journal-issue
dataset
reference-entry
dissertation
book-chapter
book-part
journal
book-set
working_paper
dissertation
other
proceedings-article
journal-article
other
proceedings
monograph

View File

@ -0,0 +1,357 @@
package eu.dnetlib.dhp.crossref
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JNothing, JObject, JString, JValue}
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
case class CrossrefDT(doi: String, json: String, timestamp: Long)
object CrossrefUtility {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10."
val CROSSREF_COLLECTED_FROM = keyValue(ModelConstants.CROSSREF_ID, ModelConstants.CROSSREF_NAME)
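// Normalises a raw DOI string: removes whitespace, lowercases it and keeps the substring starting at "10.".
// Illustrative example (not from the PR): normalizeDoi(" 10.1234/ABC ") yields "10.1234/abc"; unusable input yields null.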
def normalizeDoi(input: String): String = {
if (input == null)
return null
val replaced = input
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
.toLowerCase
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
if (replaced == null || replaced.trim.isEmpty)
return null
if (replaced.indexOf("10.") < 0)
return null
val ret = replaced.substring(replaced.indexOf("10."))
if (!ret.startsWith(DOI_PREFIX))
return null
ret
}
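// Extracts a date either from the Crossref "date-time" string or from "date-parts" ([[year, month, day]]),
// delegating the final normalisation to GraphCleaningFunctions.cleanDate.
// Illustrative example: extractDate(null, List(List(2021, 3, 5))) builds "2021-03-05" before cleaning.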
def extractDate(dt: String, datePart: List[List[Int]]): String = {
if (StringUtils.isNotBlank(dt))
return GraphCleaningFunctions.cleanDate(dt)
if (datePart != null && datePart.size == 1) {
val res = datePart.head
if (res.size == 3) {
val dp = f"${res.head}-${res(1)}%02d-${res(2)}%02d"
if (dp.length == 10) {
return GraphCleaningFunctions.cleanDate(dp)
}
} else if (res.size == 2) {
val dp = f"${res.head}-${res(1)}%02d-01"
return GraphCleaningFunctions.cleanDate(dp)
} else if (res.size == 1) {
return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
}
}
null
}
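// Wraps extractDate into a StructuredProperty with the given class/scheme qualifier; returns null when no usable date is found.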
private def generateDate(
dt: String,
datePart: List[List[Int]],
classId: String,
schemeId: String
): StructuredProperty = {
val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp))
structuredProperty(dp, classId, classId, schemeId)
else
null
}
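// Resolves the Crossref "type" through the dnet:publication_resource vocabulary and then dnet:result_typologies
// to instantiate the matching Result subclass (Publication, Dataset, Software or OtherResearchProduct).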
private def generateItemFromType(objectType: String, vocabularies: VocabularyGroup): (Result, String) = {
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType)
if (term != null) {
val resourceType = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid).getClassname
resourceType match {
case "publication" => (new Publication, resourceType)
case "dataset" => (new Dataset, resourceType)
case "software" => (new Software, resourceType)
case "otherresearchproduct" => (new OtherResearchProduct, resourceType)
case _ => null // unexpected typology: behave like an unknown type
}
} else
null
}
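// Entry point of the mapping: parses a Crossref record and returns the corresponding OAF entities,
// i.e. the mapped Result plus the relations towards its funders, or an empty list when the record cannot be typed.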
def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
var resultList: List[Oaf] = List()
val objectType = (json \ "type").extractOrElse[String](null)
if (objectType == null)
return resultList
val resultWithType = generateItemFromType(objectType, vocabularies)
if (resultWithType == null)
return List()
val result = resultWithType._1
val cOBJCategory = resultWithType._2
mappingResult(result, json, cOBJCategory)
if (result == null || result.getId == null)
return List()
val funderList: List[mappingFunder] =
(json \ "funder").extractOrElse[List[mappingFunder]](List())
if (funderList.nonEmpty) {
resultList = resultList ::: mappingFunderToRelations(
funderList,
result.getId,
createCrossrefCollectedFrom(),
result.getDataInfo,
result.getLastupdatetimestamp
)
}
result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory)
case dataset: Dataset => convertDataset(dataset)
case _ => // no type-specific enrichment for the other result types (yet)
}
resultList = resultList ::: List(result)
resultList
}
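// Populates the fields common to every Result (pid, originalId, dates, titles, description, subjects, authors, instance)
// starting from the Crossref JSON payload.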
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
val doi: String = normalizeDoi((json \ "DOI").extract[String])
result.setPid(
List(
structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES)
).asJava)
//MAPPING Crossref DOI into OriginalId
//and Other Original Identifier of dataset like clinical-trial-number
val clinicalTrialNumbers: List[String] = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids
val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
result.setOriginalId(tmp.filter(id => id != null).asJava)
// Add DataInfo
result.setDataInfo(dataInfo(false, false, 0.9F, null, false, ModelConstants.REPOSITORY_PROVENANCE_ACTIONS))
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
result.setCollectedfrom(List(CROSSREF_COLLECTED_FROM).asJava)
// Publisher ( Name of work's publisher mapped into Result/Publisher)
val publisher = (json \ "publisher").extractOrElse[String](null)
if (publisher != null && publisher.nonEmpty)
result.setPublisher(new Publisher(publisher))
// TITLE
val mainTitles = for {
JString(title) <- json \ "title" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER)
val originalTitles = for {
JString(title) <- json \ "original-title" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
val shortTitles = for {
JString(title) <- json \ "short-title" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
val subtitles = for {
JString(title) <- json \ "subtitle" if title.nonEmpty
} yield structuredProperty(title, ModelConstants.SUBTITLE_QUALIFIER)
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION
val descriptionList =
for {JString(description) <- json \ "abstract"} yield description
result.setDescription(descriptionList.asJava)
// Source
val sourceList = for {
JString(source) <- json \ "source" if source != null && source.nonEmpty
} yield source
result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping
val createdDate = generateDate(
(json \ "created" \ "date-time").extract[String],
(json \ "created" \ "date-parts").extract[List[List[Int]]],
"created",
ModelConstants.DNET_DATACITE_DATE
)
val postedDate = generateDate(
(json \ "posted" \ "date-time").extractOrElse[String](null),
(json \ "posted" \ "date-parts").extract[List[List[Int]]],
"available",
ModelConstants.DNET_DATACITE_DATE
)
val acceptedDate = generateDate(
(json \ "accepted" \ "date-time").extractOrElse[String](null),
(json \ "accepted" \ "date-parts").extract[List[List[Int]]],
"accepted",
ModelConstants.DNET_DATACITE_DATE
)
val publishedPrintDate = generateDate(
(json \ "published-print" \ "date-time").extractOrElse[String](null),
(json \ "published-print" \ "date-parts").extract[List[List[Int]]],
"published-print",
ModelConstants.DNET_DATACITE_DATE
)
val publishedOnlineDate = generateDate(
(json \ "published-online" \ "date-time").extractOrElse[String](null),
(json \ "published-online" \ "date-parts").extract[List[List[Int]]],
"published-online",
ModelConstants.DNET_DATACITE_DATE
)
val issuedDate = extractDate(
(json \ "issued" \ "date-time").extractOrElse[String](null),
(json \ "issued" \ "date-parts").extract[List[List[Int]]]
)
if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(issuedDate)
} else {
result.setDateofacceptance(createdDate.getValue)
}
result.setRelevantdate(
List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
.filter(p => p != null)
.asJava
)
//Mapping Subject
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) {
result.setSubject(
subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
)
}
//Mapping Author
val authorList: List[mappingAuthor] =
(json \ "author").extractOrElse[List[mappingAuthor]](List())
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
)
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
}.asJava)
// Mapping instance
val instance = new Instance()
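// License selection: prefer the "vor" (version of record) entry when present, otherwise fall back to the first non-blank license URL.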
val license = for {
JObject(license) <- json \ "license"
JField("URL", JString(lic)) <- license
JField("content-version", JString(content_version)) <- license
} yield (asField(lic), content_version)
val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
if (l.nonEmpty) {
if (l exists (d => d._2.equals("vor"))) {
for (d <- l) {
if (d._2.equals("vor")) {
instance.setLicense(d._1)
}
}
} else {
instance.setLicense(l.head._1)
}
}
// Ticket #6281 added pid to Instance
instance.setPid(result.getPid)
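// When Crossref exposes a "has-review" relation, flag the instance as peer reviewed (0001).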
val has_review = json \ "relation" \ "has-review" \ "id"
if (has_review != JNothing) {
instance.setRefereed(
OafMapperUtils.qualifier(
"0001",
"peerReviewed",
ModelConstants.DNET_REVIEW_LEVELS,
ModelConstants.DNET_REVIEW_LEVELS
)
)
}
instance.setAccessright(
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
)
instance.setInstancetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
result.setResourcetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate))
} else {
instance.setDateofacceptance(asField(createdDate.getValue))
}
val s: List[String] = List("https://doi.org/" + doi)
// val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct
// if (links.nonEmpty) {
// instance.setUrl(links.asJava)
// }
if (s.nonEmpty) {
instance.setUrl(s.asJava)
}
result.setInstance(List(instance).asJava)
//IMPORTANT
//The old method result.setId(generateIdentifier(result, doi))
//is replaced using IdentifierFactory, but the old identifier
//is preserved among the originalId(s)
val oldId = generateIdentifier(result, doi)
result.setId(oldId)
val newId = IdentifierFactory.createDOIBoostIdentifier(result)
if (!oldId.equalsIgnoreCase(newId)) {
result.getOriginalId.add(oldId)
}
result.setId(newId)
if (result.getId == null)
null
else
result
}
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.crossref
import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.slf4j.{Logger, LoggerFactory}
class GenerateCrossrefDataset(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log) {
/** All Spark applications run this method,
 * where the whole logic of the Spark node is defined.
 */
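// WIP: the Crossref dataset generation logic is not implemented yet; ??? throws NotImplementedError if the job is run.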
override def run(): Unit = ???
}
object GenerateCrossrefDataset {
val log: Logger = LoggerFactory.getLogger(getClass)
val propertyPath = "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
def main(args: Array[String]): Unit = {
new GenerateCrossrefDataset(propertyPath, args, log).initialize().run()
}
}

View File

@ -138,12 +138,11 @@ object DoiBoostMappingUtil {
result
}
- def decideAccessRight(lic: Field[String], date: String): AccessRight = {
- if (lic == null) {
+ def decideAccessRight(license: String, date: String): AccessRight = {
+ if (license == null || license.isEmpty) {
//Default value Unknown
return getUnknownQualifier()
}
- val license: String = lic.getValue
//CC licenses
if (
license.startsWith("cc") ||
@ -305,7 +304,7 @@ object DoiBoostMappingUtil {
}
def generateDataInfo(): DataInfo = {
- generateDataInfo("0.9")
+ generateDataInfo(0.9F)
}
def filterPublication(publication: Publication): Boolean = {
@ -330,7 +329,7 @@ object DoiBoostMappingUtil {
// fixes #4360 (test publisher)
val publisher =
- if (publication.getPublisher != null) publication.getPublisher.getValue else null
+ if (publication.getPublisher != null) publication.getPublisher.getName else null
if (
publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
@ -358,7 +357,7 @@ object DoiBoostMappingUtil {
// fixes #4368
if (
authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
- publication.getPublisher.getValue
+ publication.getPublisher.getName
)
)
return false
@ -374,8 +373,8 @@ object DoiBoostMappingUtil {
true
}
- def generateDataInfo(trust: String): DataInfo = {
- val di = new DataInfo
+ def generateDataInfo(trust: Float): DataInfo = {
+ val di = new EntityDataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
@ -384,8 +383,8 @@ object DoiBoostMappingUtil {
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
- ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
di
@ -393,7 +392,7 @@ object DoiBoostMappingUtil {
def createSubject(value: String, classId: String, schemeId: String): Subject = {
val s = new Subject
- s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
+ s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId))
s.setValue(value)
s
@ -403,67 +402,37 @@ object DoiBoostMappingUtil {
value: String,
classId: String,
className: String,
- schemeId: String,
- schemeName: String
+ schemeId: String
): Subject = {
val s = new Subject
- s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
+ s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId))
s.setValue(value)
s
}
- def createSP(
- value: String,
- classId: String,
- className: String,
- schemeId: String,
- schemeName: String
- ): StructuredProperty = {
- val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
- sp.setValue(value)
- sp
- }
def createSP(
value: String,
classId: String,
className: String,
- schemeId: String,
- schemeName: String,
- dataInfo: DataInfo
+ schemeId: String
): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId))
sp.setValue(value)
- sp.setDataInfo(dataInfo)
sp
}
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId))
sp.setValue(value)
sp
}
- def createSP(
- value: String,
- classId: String,
- schemeId: String,
- dataInfo: DataInfo
- ): StructuredProperty = {
- val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
- sp.setValue(value)
- sp.setDataInfo(dataInfo)
- sp
- }
def createCrossrefCollectedFrom(): KeyValue = {
@ -506,13 +475,6 @@ object DoiBoostMappingUtil {
}
- def asField[T](value: T): Field[T] = {
- val tmp = new Field[T]
- tmp.setValue(value)
- tmp
- }
def isEmpty(x: String) = x == null || x.trim.isEmpty
def normalizeDoi(input: String): String = {