2020-05-13 10:38:04 +02:00
|
|
|
package eu.dnetlib.doiboost.mag
|
|
|
|
|
|
|
|
|
2021-03-31 18:33:57 +02:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
2020-11-09 11:53:55 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
2020-05-28 09:57:46 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
|
|
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
2020-05-13 10:38:04 +02:00
|
|
|
import org.json4s
|
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.jackson.JsonMethods.parse
|
2020-05-19 09:24:45 +02:00
|
|
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
2020-05-13 10:38:04 +02:00
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
import scala.collection.JavaConverters._
|
|
|
|
import scala.collection.mutable
|
|
|
|
import scala.util.matching.Regex
|
2020-05-13 10:38:04 +02:00
|
|
|
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
/** Paper record of the MAG (Microsoft Academic Graph) data handled by this package.
  *
  * Columns declared as `Option` may be absent. The remaining reference-typed columns
  * (strings, `Rank`, `CreatedDate`) are plain references and can be `null`; code in this
  * file explicitly null-checks `CreatedDate`.
  * NOTE(review): field names presumably mirror the MAG column names -- confirm against the loader.
  */
case class MagPapers(
  PaperId: Long,
  Rank: Integer,
  Doi: String,
  DocType: String,
  PaperTitle: String,
  OriginalTitle: String,
  BookTitle: String,
  Year: Option[Integer],
  Date: Option[java.sql.Timestamp],
  Publisher: String,
  JournalId: Option[Long],
  ConferenceSeriesId: Option[Long],
  ConferenceInstanceId: Option[Long],
  Volume: String,
  Issue: String,
  FirstPage: String,
  LastPage: String,
  ReferenceCount: Option[Long],
  CitationCount: Option[Long],
  EstimatedCitation: Option[Long],
  OriginalVenue: String,
  FamilyId: Option[Long],
  CreatedDate: java.sql.Timestamp
)
|
2020-05-13 10:38:04 +02:00
|
|
|
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
/** Abstract attached to a MAG paper.
  *
  * `IndexedAbstract` carries either the JSON inverted index accepted by
  * `ConversionUtil.convertInvertedIndexString` or, once `ConversionUtil.transformPaperAbstract`
  * has been applied, the reconstructed plain text.
  */
case class MagPaperAbstract(
  PaperId: Long,
  IndexedAbstract: String
)
|
|
|
|
|
|
|
|
/** Author record of the MAG data; every column except `AuthorId` is optional. */
case class MagAuthor(
  AuthorId: Long,
  Rank: Option[Int],
  NormalizedName: Option[String],
  DisplayName: Option[String],
  LastKnownAffiliationId: Option[Long],
  PaperCount: Option[Long],
  CitationCount: Option[Long],
  CreatedDate: Option[java.sql.Timestamp]
)
|
|
|
|
|
|
|
|
/** Affiliation (institution) record of the MAG data; only the coordinates are optional. */
case class MagAffiliation(
  AffiliationId: Long,
  Rank: Int,
  NormalizedName: String,
  DisplayName: String,
  GridId: String,
  OfficialPage: String,
  WikiPage: String,
  PaperCount: Long,
  CitationCount: Long,
  Latitude: Option[Float],
  Longitude: Option[Float],
  CreatedDate: java.sql.Timestamp
)
|
|
|
|
|
|
|
|
/** Link between a paper, one of its authors and (optionally) the author's affiliation,
  * together with the author's sequence number in the paper.
  */
case class MagPaperAuthorAffiliation(
  PaperId: Long,
  AuthorId: Long,
  AffiliationId: Option[Long],
  AuthorSequenceNumber: Int,
  OriginalAuthor: String,
  OriginalAffiliation: String
)
|
|
|
|
|
|
|
|
|
2021-03-11 11:32:32 +01:00
|
|
|
/** An author paired with an affiliation string and a sequence number.
  *
  * `affiliation` may be `null`; consumers in this file check for that before using it.
  */
case class MagAuthorAffiliation(
  author: MagAuthor,
  affiliation: String,
  sequenceNumber: Int
)
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
/** All authors (with affiliation and sequence number) collected for one paper. */
case class MagPaperWithAuthorList(
  PaperId: Long,
  authors: List[MagAuthorAffiliation]
)
|
|
|
|
|
2021-03-11 11:32:32 +01:00
|
|
|
/** Flat (paper, author, affiliation, sequence number) row: the same data as
  * [[MagAuthorAffiliation]] plus the paper identifier.
  */
case class MagPaperAuthorDenormalized(
  PaperId: Long,
  author: MagAuthor,
  affiliation: String,
  sequenceNumber: Int
)
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
/** URL record attached to a MAG paper; everything except `PaperId` is optional. */
case class MagPaperUrl(
  PaperId: Long,
  SourceType: Option[Int],
  SourceUrl: Option[String],
  LanguageCode: Option[String]
)
|
|
|
|
|
2020-05-22 15:15:09 +02:00
|
|
|
/** A single source URL of a paper. */
case class MagUrlInstance(
  SourceUrl: String
)
|
|
|
|
|
|
|
|
/** The source URLs collected for one paper. */
case class MagUrl(
  PaperId: Long,
  instances: List[MagUrlInstance]
)
|
2020-05-19 09:24:45 +02:00
|
|
|
|
2020-05-20 08:14:03 +02:00
|
|
|
/** A field of study assigned to a paper, with its score.
  *
  * `MainType` is optional; `ConversionUtil.updatePubsWithSubject` treats a value
  * containing dots as a dotted path and also emits its first segment.
  */
case class MagSubject(
  FieldOfStudyId: Long,
  DisplayName: String,
  MainType: Option[String],
  Score: Float
)
|
|
|
|
|
|
|
|
/** The fields of study (subjects) collected for one paper. */
case class MagFieldOfStudy(
  PaperId: Long,
  subjects: List[MagSubject]
)
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
/** Journal record of the MAG data; every column except `JournalId` is optional. */
case class MagJournal(
  JournalId: Long,
  Rank: Option[Int],
  NormalizedName: Option[String],
  DisplayName: Option[String],
  Issn: Option[String],
  Publisher: Option[String],
  Webpage: Option[String],
  PaperCount: Option[Long],
  CitationCount: Option[Long],
  CreatedDate: Option[java.sql.Timestamp]
)
|
2020-05-13 10:38:04 +02:00
|
|
|
|
|
|
|
|
2020-05-22 15:15:09 +02:00
|
|
|
/** Conference instance associated with a paper.
  *
  * NOTE(review): `ci` is presumably the conference-instance identifier -- confirm against the loader.
  */
case class MagConferenceInstance(
  ci: Long,
  DisplayName: Option[String],
  Location: Option[String],
  StartDate: Option[java.sql.Timestamp],
  EndDate: Option[java.sql.Timestamp],
  PaperId: Long
)
|
|
|
|
|
2020-05-13 10:38:04 +02:00
|
|
|
case object ConversionUtil {
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
/** Picks the MAG paper identifier out of a list of candidate identifiers.
  *
  * An entry is taken to be a MAG identifier when it consists of decimal digits only.
  *
  * @param pids candidate identifiers; may be `null` and may contain `null` entries
  *             (callers in this file put a possibly-null DOI next to the paper id)
  * @return the first all-digit entry, or `null` when there is none; `null` rather than
  *         `Option` is kept so existing callers keep working
  */
def extractMagIdentifier(pids: mutable.Buffer[String]): String = {
  val magIDRegex: Regex = "^[0-9]+$".r
  Option(pids)
    // `find` stops at the first hit; null entries are skipped instead of crashing the matcher.
    .flatMap(_.find(p => p != null && magIDRegex.findFirstIn(p).isDefined))
    .orNull
}
|
|
|
|
|
|
|
|
|
2020-05-28 09:57:46 +02:00
|
|
|
/** Null-tolerant merge of two publications.
  *
  * When both are present, `b` is merged into `a` via `a.mergeFrom(b)` and `a` is returned
  * (so `a` is modified in place). When one of them is `null` the other is returned as is;
  * the result is `null` only when both are.
  */
def mergePublication(a: Publication, b: Publication): Publication =
  if (a == null) b
  else if (b == null) a
  else {
    a.mergeFrom(b)
    a
  }
|
|
|
|
|
|
|
|
/** Chooses between two records of a paper, preferring the one created most recently.
  *
  *  - if one record is `null`, the other one is returned (possibly `null` as well);
  *  - if both carry a `CreatedDate`, `p2` wins only when `p1` is strictly older;
  *  - if `p1` has no `CreatedDate`, `p2` is returned; otherwise `p1`.
  */
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers =
  if (p1 == null) p2
  else if (p2 == null) p1
  else
    (Option(p1.CreatedDate), Option(p2.CreatedDate)) match {
      case (Some(firstDate), Some(secondDate)) =>
        if (firstDate.before(secondDate)) p2 else p1
      case (None, _) => p2
      case _         => p1
    }
|
|
|
|
|
|
|
|
|
|
|
|
/** Sets the description of a publication from its joined abstract.
  *
  * @param inputItem `((key, publication), abstract)`; the key is ignored and the abstract
  *                  is `null` when nothing was joined
  * @return the same publication instance, whose description is replaced by a single
  *         field holding `IndexedAbstract` when an abstract is present
  */
def updatePubsWithDescription(inputItem: ((String, Publication), MagPaperAbstract)): Publication = {
  val publication = inputItem._1._2
  Option(inputItem._2).foreach { paperAbstract =>
    publication.setDescription(List(asField(paperAbstract.IndexedAbstract)).asJava)
  }
  publication
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Attaches conference information to a publication as a `Journal` object.
  *
  * @param inputItem `((key, publication), conferenceInstance)`; the key is ignored and the
  *                  conference instance is `null` when nothing was joined
  * @return the same publication instance; when a conference instance is present its
  *         journal is replaced by a new one carrying whichever of place, name and date
  *         range are available
  */
def updatePubsWithConferenceInfo(inputItem: ((String, Publication), MagConferenceInstance)): Publication = {
  val publication: Publication = inputItem._1._2
  val ci: MagConferenceInstance = inputItem._2

  if (ci != null) {
    val j: Journal = new Journal
    ci.Location.foreach(place => j.setConferenceplace(place))
    // DisplayName is optional like the other fields: calling `.get` on a missing name
    // would throw NoSuchElementException, so the name is simply left unset instead.
    ci.DisplayName.foreach(name => j.setName(name))
    // The date range is only meaningful when both ends are known.
    for {
      start <- ci.StartDate
      end   <- ci.EndDate
    } {
      // Timestamp.toString starts with "yyyy-mm-dd", so the first 10 characters are the date.
      j.setConferencedate(s"${start.toString.substring(0, 10)} - ${end.toString.substring(0, 10)}")
    }
    publication.setJournal(j)
  }
  publication
}
|
|
|
|
|
|
|
|
/** Sets the subjects of a publication from its joined MAG fields of study.
  *
  * For every field of study up to three subjects are emitted, in this order: the display
  * name, the main type (when defined) and, when the main type contains a dot, its first
  * dot-separated segment. The last two carry a data info built from the score; the
  * display-name subject does not.
  *
  * @param item `((key, publication), fieldOfStudy)`; the key is ignored and the field of
  *             study is `null` when nothing was joined
  * @return the same publication instance; its subject list is replaced only when at
  *         least one field of study is available
  */
def updatePubsWithSubject(item: ((String, Publication), MagFieldOfStudy)): Publication = {
  val publication = item._1._2
  val fieldOfStudy = item._2

  if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) {
    val className = "Microsoft Academic Graph classification"
    val classid = "MAG"

    // Every subject produced here shares the same classification and scheme.
    def magSubject(value: String): StructuredProperty =
      createSP(value, classid, className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)

    val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap { s =>
      val s1 = magSubject(s.DisplayName)
      val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)

      val fromMainType: List[StructuredProperty] = s.MainType.toList.flatMap { maintp =>
        val s2 = magSubject(maintp)
        s2.setDataInfo(di)

        // split drops trailing empty segments, so a main type made only of dots yields an
        // empty array; headOption skips the top-level subject there instead of throwing.
        val topLevel: Option[StructuredProperty] =
          if (maintp.contains("."))
            maintp.split("\\.").headOption.map { top =>
              val s3 = magSubject(top)
              s3.setDataInfo(di)
              s3
            }
          else None

        s2 :: topLevel.toList
      }

      s1 :: fromMainType
    }
    publication.setSubject(p.asJava)
  }
  publication
}
|
|
|
|
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
/** Builds the single instance of a publication from its MAG URLs.
  *
  * The instance URL list is made of the non-empty source URLs (when a `MagUrl` was
  * joined) followed by the MAG landing page of the paper, whose identifier is extracted
  * from the publication's original ids. The instance also receives the publication pids
  * and the MAG collected-from value.
  *
  * @param a `(publication, urls)`; `urls` is `null` when nothing was joined
  * @return the same publication instance, with its instance list replaced
  */
def addInstances(a: (Publication, MagUrl)): Publication = {
  val publication = a._1
  val magUrl = a._2

  val sourceUrls: List[String] =
    if (magUrl == null) Nil
    else magUrl.instances.collect { case inst if inst.SourceUrl.nonEmpty => inst.SourceUrl }
  val magPage = s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(publication.getOriginalId.asScala)}"

  val instance = new Instance
  instance.setUrl((sourceUrls ::: List(magPage)).asJava)
  // Ticket #6281 added pid to Instance
  instance.setPid(publication.getPid)
  instance.setCollectedfrom(createMAGCollectedFrom())

  publication.setInstance(List(instance).asJava)
  publication
}
|
|
|
|
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
/** Returns a copy of `input` whose `IndexedAbstract` has been run through
  * `convertInvertedIndexString`; the paper id is kept unchanged.
  */
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract =
  input.copy(IndexedAbstract = convertInvertedIndexString(input.IndexedAbstract))
|
2020-05-13 10:38:04 +02:00
|
|
|
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
/** Builds a `Publication` from a MAG paper, its journal and its author list.
  *
  * @param inputParams `((paper, journal), authors)`; `journal` may be `null`
  * @return a new publication carrying pid, original ids, identifier, titles, source,
  *         authors, acceptance date (when the paper has one), publisher, journal
  *         (only when the journal has a display name), collected-from and data info
  */
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
  val paper = inputParams._1._1
  val journal = inputParams._1._2
  val authors = inputParams._2

  val pub = new Publication
  pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
  pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)

  //IMPORTANT
  //The old method result.setId(generateIdentifier(result, doi))
  //will be replaced using IdentifierFactory
  // The identifier is computed from `pub` itself, so pid and original ids are set first.
  pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))

  val titles = List(
    createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE),
    createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
  )
  pub.setTitle(titles.asJava)
  pub.setSource(List(asField(paper.BookTitle)).asJava)

  // One OAF author per MAG author; name and affiliation are set only when available.
  def toOafAuthor(entry: MagAuthorAffiliation): eu.dnetlib.dhp.schema.oaf.Author = {
    val oafAuthor = new eu.dnetlib.dhp.schema.oaf.Author
    oafAuthor.setRank(entry.sequenceNumber)
    entry.author.DisplayName.foreach(name => oafAuthor.setFullname(name))
    Option(entry.affiliation).foreach(aff => oafAuthor.setAffiliation(List(asField(aff)).asJava))
    oafAuthor.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${entry.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
    oafAuthor
  }
  pub.setAuthor(authors.authors.map(toOafAuthor).asJava)

  paper.Date match {
    case Some(date) =>
      // Timestamp.toString starts with "yyyy-mm-dd": keep the date part only.
      pub.setDateofacceptance(asField(date.toString.substring(0, 10)))
    case _ => // no date (None, or a null Option): nothing to set
  }
  pub.setPublisher(asField(paper.Publisher))

  for {
    magJournal  <- Option(journal)
    journalName <- magJournal.DisplayName
  } {
    val j = new Journal
    j.setName(journalName)
    j.setSp(paper.FirstPage)
    j.setEp(paper.LastPage)
    // The journal's publisher, when known, overrides the one taken from the paper.
    magJournal.Publisher.foreach(publisher => pub.setPublisher(asField(publisher)))
    magJournal.Issn.foreach(issn => j.setIssnPrinted(issn))
    j.setVol(paper.Volume)
    j.setIss(paper.Issue)
    pub.setJournal(j)
  }

  pub.setCollectedfrom(List(createMAGCollectedFrom()).asJava)
  pub.setDataInfo(generateDataInfo())
  pub
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Builds a `Publication` from a MAG paper, its author list and its abstract.
  *
  * @param inputParams `((paper, authors), description)`; `description` is `null` when no
  *                    abstract was joined
  * @return a new publication carrying pid, original ids, identifier, titles, source,
  *         description (when present), acceptance date (when the paper has one) and authors
  */
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
  val paper = inputParams._1._1
  val authors = inputParams._1._2
  val description = inputParams._2

  val pub = new Publication
  pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
  pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)

  //IMPORTANT
  //The old method result.setId(generateIdentifier(result, doi))
  //will be replaced using IdentifierFactory
  // The identifier is computed from `pub` itself, so pid and original ids are set first.
  pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))

  val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
  val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
  pub.setTitle(List(mainTitles, originalTitles).asJava)

  pub.setSource(List(asField(paper.BookTitle)).asJava)

  if (description != null) {
    pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
  }

  val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
    val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
    // Rank is set as in createOAFFromJournalAuthorPaper, so author order is preserved.
    a.setRank(f.sequenceNumber)
    // DisplayName is optional: `.get` on a missing name would throw NoSuchElementException.
    f.author.DisplayName.foreach(name => a.setFullname(name))
    if (f.affiliation != null)
      a.setAffiliation(List(asField(f.affiliation)).asJava)
    a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
    a
  }

  // paper.Date is an Option: calling toString on the Option itself would yield
  // "Some(2020-..." (a corrupted date) or "None" (too short for substring), so the
  // timestamp has to be unwrapped before formatting.
  if (paper.Date != null && paper.Date.isDefined) {
    // Timestamp.toString starts with "yyyy-mm-dd": keep the date part only.
    pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
  }

  pub.setAuthor(authorsOAF.asJava)

  pub
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Rebuilds plain text from an inverted-index JSON document.
  *
  * The JSON is read for an integer `IndexLength` and an `InvertedIndex` object mapping
  * each word to the list of positions at which it occurs. Words are written at their
  * positions in an array of `IndexLength` slots, which is then joined with single
  * spaces; slots no word refers to stay empty strings.
  *
  * @param json_input the JSON document
  * @return the reconstructed text, or `""` when `IndexLength` is not positive
  */
def convertInvertedIndexString(json_input: String): String = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val json: json4s.JValue = parse(json_input)
  val idl = (json \ "IndexLength").extract[Int]

  if (idl <= 0) ""
  else {
    // Start from empty strings so unreferenced positions need no clean-up pass.
    val words = Array.fill(idl)("")
    val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
    for {
      (word: String, positions: List[Int]) <- iid
      position <- positions
    } words(position) = word
    words.mkString(" ")
  }
}
|
|
|
|
}
|