dnet-hadoop/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala

334 lines
9.6 KiB
Scala
Raw Normal View History

package eu.dnetlib.doiboost
2020-05-26 09:15:33 +02:00
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
2020-05-26 09:15:33 +02:00
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
2020-05-23 08:46:49 +02:00
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
2020-05-26 09:15:33 +02:00
import org.slf4j.{Logger, LoggerFactory}
2020-05-23 08:46:49 +02:00
import scala.collection.JavaConverters._
import scala.io.Source
case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
2020-05-26 09:15:33 +02:00
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:String){}
object DoiBoostMappingUtil {
2020-05-26 09:15:33 +02:00
val logger: Logger = LoggerFactory.getLogger(getClass)
//STATIC STRING
val MAG = "microsoft"
2020-05-23 08:46:49 +02:00
val MAG_NAME = "Microsoft Academic Graph"
val ORCID = "ORCID"
val CROSSREF = "Crossref"
val UNPAYWALL = "UnpayWall"
val GRID_AC = "grid.ac"
val WIKPEDIA = "wikpedia"
val doiBoostNSPREFIX = "doiboost____"
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
val DNET_LANGUAGES = "dnet:languages"
val PID_TYPES = "dnet:pid_types"
2020-05-23 08:46:49 +02:00
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
2020-05-26 09:15:33 +02:00
def toActionSet(item:Oaf) :(String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: Dataset =>
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
a.setClazz(classOf[Dataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
2020-05-23 08:46:49 +02:00
def retrieveHostedByMap(): Map[String, HostedByItemType] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val jsonMap = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/hbMap.json")).mkString
lazy val json: json4s.JValue = parse(jsonMap)
json.extract[Map[String, HostedByItemType]]
}
2020-05-23 08:46:49 +02:00
def retrieveHostedByItem(issn: String, eissn: String, lissn: String, hostedByMap: Map[String, HostedByItemType]): HostedByItemType = {
if (issn != null && issn.nonEmpty && hostedByMap.contains(issn))
return hostedByMap(issn)
2020-05-23 08:46:49 +02:00
if (eissn != null && eissn.nonEmpty && hostedByMap.contains(eissn))
return hostedByMap(eissn)
2020-05-23 08:46:49 +02:00
if (lissn != null && lissn.nonEmpty && hostedByMap.contains(lissn))
return hostedByMap(lissn)
null
}
2020-05-26 09:15:33 +02:00
def generateGridAffiliationId(gridId:String) :String = {
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
}
2020-05-26 09:15:33 +02:00
def fixResult(result: Dataset) :Dataset = {
val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
if (instanceType.isDefined) {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
result.getInstance().asScala.foreach(i => {
val hb = new KeyValue
hb.setValue("Unknown Repository")
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
i.setHostedby(hb)
})
result
}
2020-05-23 08:46:49 +02:00
def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
2020-05-26 09:15:33 +02:00
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
2020-05-26 09:15:33 +02:00
val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
if (instanceType.isDefined) {
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
2020-05-23 08:46:49 +02:00
val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
2020-05-26 09:15:33 +02:00
publication.getInstance().asScala.foreach(i => {
val hb = new KeyValue
if (item != null) {
hb.setValue(item.officialName)
hb.setKey(generateDSId(item.id))
if (item.openAccess)
2020-05-23 08:46:49 +02:00
i.setAccessright(createQualifier("Open", "dnet:access_modes"))
2020-05-26 09:15:33 +02:00
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
}
else {
hb.setValue("Unknown Repository")
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
}
i.setHostedby(hb)
})
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
if (ar.nonEmpty) {
if(ar.contains("Open")){
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
}
else {
publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
}
2020-05-23 08:46:49 +02:00
}
publication
2020-05-26 09:15:33 +02:00
}
2020-05-26 09:15:33 +02:00
def generateDSId(input: String): String = {
2020-05-26 09:15:33 +02:00
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|${b}::${DHPUtils.md5(a)}"
}
2020-05-26 09:15:33 +02:00
def generateDataInfo(): DataInfo = {
generateDataInfo("0.9")
}
2020-05-26 09:15:33 +02:00
def filterPublication(publication: Publication): Boolean = {
2020-05-26 09:15:33 +02:00
//Case empty publication
if (publication == null)
return false
2020-05-26 09:15:33 +02:00
//Case publication with no title
if (publication.getTitle == null || publication.getTitle.size == 0)
return false
2020-05-26 09:15:33 +02:00
val s = publication.getTitle.asScala.count(p => p.getValue != null
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
2020-05-26 09:15:33 +02:00
if (s == 0)
return false
2020-05-26 09:15:33 +02:00
// fixes #4360 (test publisher)
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
2020-05-26 09:15:33 +02:00
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
return false;
}
2020-05-26 09:15:33 +02:00
//Publication with no Author
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
return false
//filter invalid author
val authors = publication.getAuthor.asScala.map(s => {
if (s.getFullname.nonEmpty) {
s.getFullname
}
else
s"${
s.getName
} ${
s.getSurname
}"
})
val c = authors.count(isValidAuthorName)
if (c == 0)
return false
// fixes #4368
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
return false
true
}
2020-05-26 09:15:33 +02:00
def isValidAuthorName(fullName: String): Boolean = {
if (fullName == null || fullName.isEmpty)
return false
if (invalidName.contains(fullName.toLowerCase.trim))
return false
true
}
2020-05-26 09:15:33 +02:00
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
di
}
2020-05-26 09:15:33 +02:00
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createCrossrefCollectedFrom(): KeyValue = {
2020-05-26 09:15:33 +02:00
val cf = new KeyValue
cf.setValue(CROSSREF)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
cf
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createUnpayWallCollectedFrom(): KeyValue = {
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
val cf = new KeyValue
cf.setValue(UNPAYWALL)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
cf
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createORIDCollectedFrom(): KeyValue = {
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
val cf = new KeyValue
cf.setValue(ORCID)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
cf
}
2020-05-23 08:46:49 +02:00
def generateIdentifier (oaf: Result, doi: String): String = {
2020-05-26 09:15:33 +02:00
val id = DHPUtils.md5 (doi.toLowerCase)
if (oaf.isInstanceOf[Dataset] )
return s"60|${
doiBoostNSPREFIX
}${
SEPARATOR
}${
id
}"
s"50|${
doiBoostNSPREFIX
}${
SEPARATOR
}${
id
}"
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createMAGCollectedFrom(): KeyValue = {
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
val cf = new KeyValue
cf.setValue(MAG_NAME)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
cf
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
val q = new Qualifier
q.setClassid(clsName)
q.setClassname(clsValue)
q.setSchemeid(schName)
q.setSchemename(schValue)
q
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def createQualifier(cls: String, sch: String): Qualifier = {
createQualifier(cls, cls, sch, sch)
}
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
def asField[T](value: T): Field[T] = {
val tmp = new Field[T]
tmp.setValue(value)
tmp
2020-05-23 08:46:49 +02:00
2020-05-26 09:15:33 +02:00
}
}