2021-10-15 15:00:15 +02:00
|
|
|
package eu.dnetlib.dhp.sx.bio.pubmed
|
2021-10-12 08:11:53 +02:00
|
|
|
|
2021-06-16 14:56:24 +02:00
|
|
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
2022-01-12 09:40:28 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
2021-10-12 08:11:53 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf._
|
2022-07-13 15:27:17 +02:00
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils
|
|
|
|
import org.apache.commons.lang3.StringUtils
|
2021-06-29 12:02:03 +02:00
|
|
|
|
2022-07-13 15:27:17 +02:00
|
|
|
import collection.JavaConverters._
|
2021-08-30 09:32:21 +02:00
|
|
|
import java.util.regex.Pattern
|
2022-07-13 15:27:17 +02:00
|
|
|
import scala.collection.mutable.ListBuffer
|
2021-06-16 14:56:24 +02:00
|
|
|
|
2021-11-15 14:32:01 +01:00
|
|
|
/** Helpers that map PubMed articles onto OAF results.
  */
|
2021-06-16 14:56:24 +02:00
|
|
|
object PubMedToOaf {
|
|
|
|
|
|
|
|
/** Class id / class name given to every subject built from a PubMed article. */
val SUBJ_CLASS: String = "keywords"

/** Prefix prepended to the numeric PMC id before it is hashed into a legacy identifier. */
val OAI_HEADER: String = "oai:pubmedcentral.nih.gov:"

/** Prefix of the legacy OpenAIRE identifiers built from a PMC id. */
val OLD_PMC_PREFIX: String = "od_______267::"

/** Base URL to prepend to a pid value, keyed by the pid class id. */
val urlMap: Map[String, String] = Map(
  ("pmid", "https://pubmed.ncbi.nlm.nih.gov/"),
  ("doi", "https://dx.doi.org/")
)
|
|
|
|
|
|
|
|
/** DataInfo shared by every field, pid and entity produced by this mapper.
  *
  * NOTE(review): the arguments are positional; the three `false` flags, the `null` and the
  * trailing "0.9" (presumably the trust value) should be checked against
  * `OafMapperUtils.dataInfo` before being changed.
  */
val dataInfo: DataInfo =
  OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")

/** The datasource recorded as `collectedfrom` on both the result and its instance. */
val collectedFrom: KeyValue = OafMapperUtils.keyValue(
  ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
  "Europe PubMed Central"
)
|
2021-06-16 14:56:24 +02:00
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Case-insensitive pattern describing a bare DOI: the literal `10.` prefix, a registrant
  * code of 4 to 9 digits, a slash and a suffix restricted to the characters listed in the class.
  * Compiled once because [[cleanDoi]] is invoked for every mapped article.
  */
private val DOI_PATTERN: Pattern =
  Pattern.compile("^10\\.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE)

/** Validates a DOI against [[DOI_PATTERN]].
  *
  * The pattern is anchored at both ends, so values that do not start with the `10.` prefix
  * (for instance a DOI embedded in a resolver URL) are rejected rather than rewritten.
  *
  * @param doi input DOI, may be null
  * @return the DOI when it matches the expected shape, null otherwise (including null input)
  */
def cleanDoi(doi: String): String =
  if (doi == null) null
  else {
    val matcher = DOI_PATTERN.matcher(doi)
    // `find` (not `matches`) is kept on purpose: `$` tolerates one trailing line terminator,
    // which is then excluded from the returned group.
    if (matcher.find) matcher.group(0) else null
  }
|
|
|
|
|
2022-07-29 11:56:01 +02:00
|
|
|
/** Builds the legacy OpenAIRE identifier of an article from its PMC id.
  *
  * Every occurrence of "PMC" is removed from the PMC id, the remainder is appended to
  * [[OAI_HEADER]], and the MD5 of that string is appended to [[OLD_PMC_PREFIX]].
  *
  * A blank (null, empty or whitespace-only) PMC id is treated as missing, which is the same
  * rule `convert` applies before emitting the PMC pid.
  *
  * @param article the PubMed article, may be null
  * @return the legacy identifier, or null when the article has no usable PMC id
  */
def createOriginalOpenaireId(article: PMArticle): String =
  if (article != null && StringUtils.isNotBlank(article.getPmcId)) {
    val oaiIdentifier = s"$OAI_HEADER${article.getPmcId.replace("PMC", "")}"
    s"$OLD_PMC_PREFIX${DHPUtils.md5(oaiIdentifier)}"
  } else
    null
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Creates an empty instance of the [[Result]] subclass matching an OAF instance type.
  *
  * The class id of `cobjQualifier` is looked up in the `DNET_RESULT_TYPOLOGIES` vocabulary and
  * the class id of the resulting term selects the concrete class.
  *
  * @param cobjQualifier OAF instance type, may be null
  * @param vocabularies  all dnet vocabularies
  * @return a new Dataset, Publication, OtherResearchProduct or Software; null when the
  *         qualifier is null, the vocabulary lookup yields nothing, or the typology is not one
  *         of the four handled values
  */
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
  // Every step may legitimately produce null (unresolved instance type, unknown term), so the
  // chain is wrapped in Option instead of dereferencing blindly.
  val typology: Option[String] = Option(cobjQualifier)
    .flatMap(q => Option(getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, q.getClassid)))
    .flatMap(term => Option(term.getClassid))

  typology match {
    case Some("dataset")     => new Dataset
    case Some("publication") => new Publication
    case Some("other")       => new OtherResearchProduct
    case Some("software")    => new Software
    case _                   => null
  }
}
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Maps the PubMed journal information onto an OAF Journal.
  *
  * @param j the PubMed journal, may be null
  * @return the OAF Journal, or null when `j` is null
  */
def mapJournal(j: PMJournal): Journal =
  Option(j).map { source =>
    val mapped = new Journal
    mapped.setDataInfo(dataInfo)
    mapped.setName(source.getTitle)
    // The journal date is stored in the `conferencedate` field of the OAF Journal.
    mapped.setConferencedate(source.getDate)
    mapped.setVol(source.getVolume)
    mapped.setIssnPrinted(source.getIssn)
    mapped.setIss(source.getIssue)
    mapped
  }.orNull
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Resolves a term in a vocabulary, looking at synonyms first and at the terms second.
  *
  * Both lookups are always performed; the synonym match wins whenever it is not null.
  *
  * @param vocabularyName the input vocabulary name
  * @param vocabularies   all the vocabularies
  * @param term           the term to find
  * @return the qualifier found among the synonyms, otherwise the one returned by the term lookup
  */
def getVocabularyTerm(
  vocabularyName: String,
  vocabularies: VocabularyGroup,
  term: String
): Qualifier = {
  val fromSynonyms = vocabularies.getSynonymAsQualifier(vocabularyName, term)
  val fromTerms = vocabularies.getTermAsQualifier(vocabularyName, term)
  Option(fromSynonyms).getOrElse(fromTerms)
}
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Maps a PubMed article onto the matching OAF result.
  *
  * The article is discarded (null is returned) when:
  *  - it is null, or has no publication types or no title;
  *  - none of its publication types can be resolved in the `DNET_PUBLICATION_RESOURCE`
  *    vocabulary (when a "Journal Article" type is present, only that one is considered);
  *  - `createResult` does not know the resulting typology;
  *  - the identifier produced by `IdentifierFactory.createIdentifier` equals the bare PMID.
  *
  * @param article      the PubMed article
  * @param vocabularies the vocabularies
  * @return the OAF instance if the mapping did not fail, null otherwise
  */
def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {

  // Cheap guards first: nothing below is worth computing for an article that will be dropped.
  if (article == null || article.getPublicationTypes == null)
    return null
  if (article.getTitle == null || article.getTitle.isEmpty)
    return null

  // Builds a pid-like structured property whose class id and class name are both `pidType`.
  def pidOf(value: String, pidType: PidType): StructuredProperty =
    OafMapperUtils.structuredProperty(
      value,
      pidType.toString,
      pidType.toString,
      ModelConstants.DNET_PID_TYPES,
      ModelConstants.DNET_PID_TYPES,
      dataInfo
    )

  // A single rule decides whether the PMC id is usable, for both the pid list and the
  // original identifiers.
  val hasPmcId = StringUtils.isNotBlank(article.getPmcId)

  // MAP PMID (and PMC id, when present) into pid.
  // Kept as a ListBuffer so that the Java views handed to the OAF model stay mutable.
  val pidList = ListBuffer[StructuredProperty](pidOf(article.getPmid, PidType.pmid))
  if (hasPmcId)
    pidList += pidOf(article.getPmcId, PidType.pmc)

  // MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi
  val alternateIdentifier: Option[StructuredProperty] =
    Option(article.getDoi)
      .flatMap(doi => Option(cleanDoi(doi)))
      .map(doi => pidOf(doi, PidType.doi))

  // INSTANCE MAPPING
  //--------------------------------------------------------------------------------------

  def resolveInstanceType(value: String): Qualifier =
    getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, value)

  // If the article contains the typology Journal Article then we apply this type,
  // else we take the first publication type that matches the vocabulary; otherwise we discard it.
  val publicationTypes = article.getPublicationTypes.asScala
  val instanceType: Qualifier =
    publicationTypes.find(t => "Journal Article".equalsIgnoreCase(t.getValue)) match {
      case Some(journalArticle) => resolveInstanceType(journalArticle.getValue)
      case None =>
        publicationTypes.iterator.map(t => resolveInstanceType(t.getValue)).find(_ != null).orNull
    }
  if (instanceType == null)
    return null

  val result = createResult(instanceType, vocabularies)
  if (result == null)
    return null

  val pubmedInstance = new Instance
  pubmedInstance.setInstancetype(instanceType)
  pubmedInstance.setPid(pidList.asJava)
  alternateIdentifier.foreach(doi => pubmedInstance.setAlternateIdentifier(List(doi).asJava))

  // CREATE URL from the pids that have a known base URL in urlMap
  val urlLists: List[String] = pidList.toList.flatMap { pid =>
    urlMap.get(pid.getQualifier.getClassid).map(baseUrl => baseUrl + pid.getValue)
  }
  pubmedInstance.setUrl(urlLists.asJava)

  // ASSIGN DateofAcceptance: the date is cleaned once, but the instance and the result each
  // get their own field object.
  val acceptanceDate = GraphCleaningFunctions.cleanDate(article.getDate)
  pubmedInstance.setDateofacceptance(OafMapperUtils.field(acceptanceDate, dataInfo))

  // ASSIGN COLLECTEDFROM
  pubmedInstance.setCollectedfrom(collectedFrom)

  result.setDataInfo(dataInfo)
  result.setInstance(List(pubmedInstance).asJava)
  result.setPid(pidList.asJava)
  //END INSTANCE MAPPING
  //--------------------------------------------------------------------------------------

  // JOURNAL MAPPING
  //--------------------------------------------------------------------------------------
  result match {
    case publication: Publication if article.getJournal != null =>
      publication.setJournal(mapJournal(article.getJournal))
    case _ => // only publications carry journal information
  }
  result.setCollectedfrom(List(collectedFrom).asJava)
  //END JOURNAL MAPPING
  //--------------------------------------------------------------------------------------

  // RESULT MAPPING
  //--------------------------------------------------------------------------------------
  result.setDateofacceptance(OafMapperUtils.field(acceptanceDate, dataInfo))

  result.setTitle(
    List(
      OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)
    ).asJava
  )

  if (article.getDescription != null && article.getDescription.nonEmpty)
    result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)

  // The language is only set when it is a known synonym in the languages vocabulary.
  Option(article.getLanguage)
    .flatMap(lang => Option(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, lang)))
    .foreach(term => result.setLanguage(term))

  // A null subject list is handled like an empty one.
  val subjects: List[Subject] = Option(article.getSubjects)
    .map(_.asScala.toList)
    .getOrElse(Nil)
    .map(s =>
      OafMapperUtils.subject(
        s.getValue,
        SUBJ_CLASS,
        SUBJ_CLASS,
        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
        dataInfo
      )
    )
  result.setSubject(subjects.asJava)

  // Authors keep their original order; rank is 1-based. A null list is handled like an empty one.
  val authors: List[Author] = Option(article.getAuthors)
    .map(_.asScala.toList)
    .getOrElse(Nil)
    .zipWithIndex
    .map { case (a, index) =>
      val author = new Author()
      author.setName(a.getForeName)
      author.setSurname(a.getLastName)
      author.setFullname(a.getFullName)
      author.setRank(index + 1)
      author
    }
  if (authors.nonEmpty)
    result.setAuthor(authors.asJava)

  // Original ids: the legacy PMC-based identifier first (when a PMC id exists), then every pid value.
  val originalIds = ListBuffer[String]()
  if (hasPmcId)
    originalIds += createOriginalOpenaireId(article)
  originalIds ++= pidList.map(_.getValue)
  result.setOriginalId(originalIds.asJava)

  // The PMID is set as a provisional id before the factory computes the definitive one.
  result.setId(article.getPmid)
  // END RESULT MAPPING
  //--------------------------------------------------------------------------------------

  val id = IdentifierFactory.createIdentifier(result)
  // NOTE(review): an id equal to the bare PMID presumably means the factory could not build a
  // proper identifier - confirm against IdentifierFactory. The comparison is null-safe.
  if (StringUtils.equalsIgnoreCase(article.getPmid, id))
    return null
  result.setId(id)
  result
}
|
|
|
|
|
|
|
|
}
|