dnet-hadoop/dhp-workflows/dhp-graph-provision-scholex.../src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala

426 lines
17 KiB
Scala

package eu.dnetlib.dhp.export
import com.fasterxml.jackson.databind.ObjectMapper
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
import scala.collection.JavaConverters._
case class DLIExternalReference(id: String, url: String, sitename: String, label: String, pid: String, classId: String) {}
object DLIToOAF {
val collectedFromMap: Map[String, KeyValue] = Map(
"dli_________::r3d100010527" -> generateKeyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive"),
"dli_________::r3d100010255" -> generateKeyValue("10|re3data_____::480d275ed6f9666ee76d6a1215eabf26", "Inter-university Consortium for Political and Social Research"),
"dli_________::r3d100011868" -> generateKeyValue("10|re3data_____::db814dc656a911b556dba42a331cebe9", "Mendeley Data"),
"dli_________::elsevier" -> generateKeyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"),
"dli_________::openaire" -> generateKeyValue("10|infrastruct_::f66f1bd369679b5b077dcdf006089556", "OpenAIRE"),
"dli_________::thomsonreuters" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
"dli_________::r3d100010216" -> generateKeyValue("10|re3data_____::0fd79429de04343dbbec705d9b5f429f", "4TU.Centre for Research Data"),
"dli_________::r3d100010134" -> generateKeyValue("10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb", "PANGAEA"),
"dli_________::ieee" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
"dli_________::r3d100010197" -> generateKeyValue("10|re3data_____::9fd1d79973f7fda60cbe1d82e3819a68", "The Cambridge Structural Database"),
"dli_________::nature" -> generateKeyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"),
"dli_________::datacite" -> generateKeyValue("10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite"),
"dli_________::r3d100010578" -> generateKeyValue("10|re3data_____::c4d751f29a7568011a4c80136b30b444", "IEDA"),
"dli_________::r3d100010464" -> generateKeyValue("10|re3data_____::23e2a81591099828f6b83a1c83150666", "Research Data Australia"),
"dli_________::r3d100010327" -> generateKeyValue("10|re3data_____::a644620b81135243dc9acc15d2362246", "Worldwide Protein Data Bank"),
"dli_________::pubmed" -> generateKeyValue("10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"),
"dli_________::europe_pmc__" -> generateKeyValue("10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"),
"dli_________::crossref" -> generateKeyValue("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref")
)
val relationTypeMapping: Map[String, (String, String)] = Map(
"IsReferencedBy" -> ("isRelatedTo", "relationship"),
"References" -> ("isRelatedTo", "relationship"),
"IsRelatedTo" -> ("isRelatedTo", "relationship"),
"IsSupplementedBy" -> ("isSupplementedBy", "supplement"),
"Documents"-> ("isRelatedTo", "relationship"),
"Cites" -> ("cites", "citation"),
"Unknown" -> ("isRelatedTo", "relationship"),
"IsSourceOf" -> ("isRelatedTo", "relationship"),
"IsCitedBy" -> ("IsCitedBy", "citation"),
"Reviews" -> ("reviews", "review"),
"Describes" -> ("isRelatedTo", "relationship"),
"HasAssociationWith" -> ("isRelatedTo", "relationship")
)
val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url")
val filteredURL = List(
"www.ebi.ac.uk",
"www.uniprot.org",
"f1000.com",
"en.wikipedia.org",
"flybase.org",
"www.yeastgenome.org",
"research.bioinformatics.udel.edu",
"cancer.sanger.ac.uk",
"www.iedb.org",
"www.crd.york.ac.uk",
"www.wormbase.org",
"web.expasy.org",
"www.hal.inserm.fr",
"sabiork.h-its.org",
"zfin.org",
"www.pombase.org",
"www.guidetopharmacology.org",
"reactome.org"
)
val rel_inverse: Map[String, String] = Map(
"isRelatedTo" -> "isRelatedTo",
"isSupplementedBy" -> "isSupplementTo",
"cites" -> "IsCitedBy",
"IsCitedBy" -> "cites",
"reviews" -> "IsReviewedBy"
)
val PidTypeMap: Map[String, String] = Map(
"pbmid" -> "pmid",
"pmcid" -> "pmc",
"pmid" -> "pmid",
"pubmedid" -> "pmid",
"DOI" -> "doi",
"doi" -> "doi"
)
def fixInstance(r:Publication) :Publication = {
val collectedFrom = r.getCollectedfrom.asScala.head
r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
r
}
def fixInstanceDataset(r:Dataset) :Dataset = {
val collectedFrom = r.getCollectedfrom.asScala.head
r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
r
}
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: Dataset =>
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
a.setClazz(classOf[Dataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def convertClinicalTrial(dataset: DLIDataset): (String, String) = {
val currentId = generateId(dataset.getId)
val pids = dataset.getPid.asScala.filter(p => "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}")
if (pids.isEmpty)
null
else
(currentId, pids.head)
}
def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = {
val eRefs = externalReferences.map(e => {
val result = new ExternalReference()
result.setSitename(e.sitename)
result.setLabel(e.label)
result.setUrl(e.url)
result.setRefidentifier(e.pid)
result.setDataInfo(generateDataInfo())
result.setQualifier(createQualifier(e.classId, "dnet:externalReference_typologies"))
result
})
publication.setExternalReference(eRefs.asJava)
publication
}
def filterPid(p: StructuredProperty): Boolean = {
if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url"))
if (filteredURL.exists(u => p.getValue.contains(u)))
return true
else
return false
expectecdPidType.contains(p.getQualifier.getClassname)
}
def extractTitle(titles: java.util.List[StructuredProperty]): String = {
if (titles == null)
return null
val label = titles.asScala.map(p => p.getValue).find(p => p.nonEmpty)
label.orNull
}
def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = {
val pids = dataset.getPid.asScala.filter(filterPid)
if (pids == null || pids.isEmpty)
return null
val pid: StructuredProperty = pids.head
pid.getQualifier.getClassname match {
case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "ena" =>
if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7)
DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
else
null
case "chembl" => DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/chembl/compound_report_card/${pid.getValue}", "ChEMBL", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "ncbi-n" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Nucleotide Database", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "ncbi-p" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Nucleotide Database", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "genbank" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "GenBank", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "pdb" => DLIExternalReference(generateId(dataset.getId), s"https://www.ncbi.nlm.nih.gov/nuccore/${pid.getValue}", "Protein Data Bank", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "url" => DLIExternalReference(generateId(dataset.getId), pid.getValue, "", extractTitle(dataset.getTitle), pid.getValue, "url")
}
}
def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = {
val result = new Publication
val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid))
.map(p => {
p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid))
p
})
if (cleanedPids.isEmpty)
return null
result.setId(generateId(inputPublication.getId))
result.setDataInfo(generateDataInfo(invisibile = true))
if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null))
return null
result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
if(result.getCollectedfrom.isEmpty)
return null
result.setPid(cleanedPids.asJava)
result.setDateofcollection(inputPublication.getDateofcollection)
result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava)
result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
return null
result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
if (inputPublication.getSubject != null)
result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)
if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty)
return null
result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava)
if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0)
return null
result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava)
result.setDescription(inputPublication.getDescription)
result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
result.setPublisher(inputPublication.getPublisher)
result.setSource(inputPublication.getSource)
result.setBestaccessright(createAccessRight("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes"))
val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
if (dois.isEmpty)
return null
val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance)
if (i != null)
result.setInstance(List(i).asJava)
result
}
def convertDLIRelation(r: Relation): Relation = {
val rt = r.getRelType
if (!relationTypeMapping.contains(rt))
return null
r.setRelType("resultResult")
r.setRelClass(relationTypeMapping(rt)._1)
r.setSubRelType(relationTypeMapping(rt)._2)
r.setSource(generateId(r.getSource))
r.setTarget(generateId(r.getTarget))
r
}
def convertDLIDatasetTOOAF(d: DLIDataset): Dataset = {
if (d.getCollectedfrom == null || d.getCollectedfrom.size() == 0 || (d.getCollectedfrom.size() == 1 && d.getCollectedfrom.get(0) == null))
return null
val result: Dataset = new Dataset
result.setId(generateId(d.getId))
result.setDataInfo(generateDataInfo())
result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
if(result.getCollectedfrom.isEmpty)
return null
result.setPid(d.getPid)
val fpids = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname) ||
"pdb".equalsIgnoreCase(p.getQualifier.getClassname)
).map(p => p.getValue)
if (fpids == null || fpids.isEmpty)
return null
result.setDateofcollection(d.getDateofcollection)
result.setOriginalId(d.getPid.asScala.map(d => d.getValue).asJava)
result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
if (d.getAuthor == null || d.getAuthor.isEmpty)
return null
result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
if (d.getSubject != null)
result.setSubject(d.getSubject.asScala.map(convertSubject).asJava)
if (d.getTitle == null || d.getTitle.isEmpty)
return null
result.setTitle(List(patchTitle(d.getTitle.get(0))).asJava)
if (d.getRelevantdate == null || d.getRelevantdate.size() == 0)
return null
result.setRelevantdate(d.getRelevantdate.asScala.map(patchRelevantDate).asJava)
result.setDescription(d.getDescription)
result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue))
result.setPublisher(d.getPublisher)
result.setSource(d.getSource)
result.setBestaccessright(createAccessRight("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes"))
val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}"
val i: Instance = createInstance(instance_urls, firstInstanceOrNull(d.getInstance()), result.getDateofacceptance, true)
// Ticket #6281 added pid to Instance
i.setPid(result.getPid)
if (i != null)
result.setInstance(List(i).asJava)
result
}
def firstInstanceOrNull(instances: java.util.List[Instance]): Instance = {
if (instances == null || instances.size() == 0)
return null
instances.get(0)
}
def createInstance(url: String, originalInstance: Instance, doa: Field[String], dataset: Boolean = false): Instance = {
val i = new Instance
i.setUrl(List(url).asJava)
if (dataset)
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
else
i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource"))
if (originalInstance != null && originalInstance.getHostedby != null)
i.setHostedby(originalInstance.getHostedby)
i.setAccessright(createAccessRight("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes"))
i.setDateofacceptance(doa)
i
}
def patchRelevantDate(d: StructuredProperty): StructuredProperty = {
d.setQualifier(createQualifier("UNKNOWN", "dnet:dataCite_date"))
d
}
def patchTitle(t: StructuredProperty): StructuredProperty = {
t.setQualifier(createQualifier("main title", "dnet:dataCite_title"))
t
}
def convertSubject(s: StructuredProperty): StructuredProperty = {
s.setQualifier(createQualifier("keyword", "dnet:subject_classification_typologies"))
s
}
def convertAuthor(a: Author): Author = {
if (a == null)
return a
val p = new PacePerson(a.getFullname, false)
if (p.isAccurate) {
a.setName(p.getNameString)
a.setSurname(p.getSurnameString)
}
a
}
def generateId(id: String): String = {
val md5 = if (id.contains("::")) StringUtils.substringAfter(id, "::") else StringUtils.substringAfter(id, "|")
s"50|scholix_____::$md5"
}
}