dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala

238 lines
8.4 KiB
Scala

package eu.dnetlib.dhp.sx.bio.pubmed
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import java.util.regex.Pattern
import collection.JavaConverters._
/**
*
*/
object PubMedToOaf {
val SUBJ_CLASS = "keywords"
val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/"
)
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/**
* Cleaning the DOI Applying regex in order to
* remove doi starting with URL
*
* @param doi input DOI
* @return cleaned DOI
*/
def cleanDoi(doi: String): String = {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
val matcher = pattern.matcher(doi)
if (matcher.find) {
return matcher.group(0)
}
null
}
/**
*
* Create an instance of class extends Result
* starting from OAF instanceType value
*
* @param cobjQualifier OAF instance type
* @param vocabularies All dnet vocabularies
* @return the correct instance
*/
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match {
case "dataset" => new Dataset
case "publication" => new Publication
case "other" => new OtherResearchProduct
case "software" => new Software
case _ => null
}
}
/**
* Mapping the Pubmedjournal info into the OAF Journale
*
* @param j the pubmedJournal
* @return the OAF Journal
*/
def mapJournal(j: PMJournal): Journal = {
if (j == null)
return null
val journal = new Journal
journal.setDataInfo(dataInfo)
journal.setName(j.getTitle)
journal.setConferencedate(j.getDate)
journal.setVol(j.getVolume)
journal.setIssnPrinted(j.getIssn)
journal.setIss(j.getIssue)
journal
}
/**
*
* Find vocabulary term into synonyms and term in the vocabulary
*
* @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies
* @param term the term to find
* @return the cleaned term value
*/
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a
}
/**
* Map the Pubmed Article into the OAF instance
*
* @param article the pubmed articles
* @param vocabularies the vocabularies
* @return The OAF instance if the mapping did not fail
*/
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
if (article.getPublicationTypes == null)
return null
// MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null)
return null
// MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi
var alternateIdentifier: StructuredProperty = null
if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid != null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
}
// INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// If the article contains the typology Journal Article then we apply this type
//else We have to find a terms that match the vocabulary otherwise we discard it
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance
if (ja.isDefined) {
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
pubmedInstance.setInstancetype(cojbCategory)
} else {
val i_type = article.getPublicationTypes.asScala
.map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
.find(q => q != null)
if (i_type.isDefined)
pubmedInstance.setInstancetype(i_type.get)
else
return null
}
val result = createResult(pubmedInstance.getInstancetype, vocabularies)
if (result == null)
return result
result.setDataInfo(dataInfo)
pubmedInstance.setPid(pidList.asJava)
if (alternateIdentifier != null)
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(pubmedInstance).asJava)
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty)
.map(t => t._1 + t._2)
if (urlLists != null)
pubmedInstance.setUrl(urlLists.asJava)
//ASSIGN DateofAcceptance
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
//ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava)
//END INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// JOURNAL MAPPING
//--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication])
result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
result.setCollectedfrom(List(collectedFrom).asJava)
//END JOURNAL MAPPING
//--------------------------------------------------------------------------------------
// RESULT MAPPING
//--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
if (article.getTitle == null || article.getTitle.isEmpty)
return null
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
if (article.getDescription != null && article.getDescription.nonEmpty)
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
if (article.getLanguage != null) {
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
if (term != null)
result.setLanguage(term)
}
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
if (subjects != null)
result.setSubject(subjects.asJava)
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
val author = new Author()
author.setName(a.getForeName)
author.setSurname(a.getLastName)
author.setFullname(a.getFullName)
author.setRank(index + 1)
author
}(collection.breakOut)
if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(article.getPmid)
// END RESULT MAPPING
//--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result)
if (article.getPmid.equalsIgnoreCase(id))
return null
result.setId(id)
result
}
}