dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostebymap/SparkPrepareHostedByMapData...

package eu.dnetlib.dhp.oa.graph.hostebymap

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostebymap.model.{DOAJModel, UnibiGoldModel}
import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization.write
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.mutable.ListBuffer

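/**
 * Spark job preparing the HostedBy map: it joins the OpenAIRE datasource dump with the
 * DOAJ and Unibi Gold open-access journal lists on the journal identifier (ISSN/EISSN/ISSNL)
 * and emits, for each identifier, a JSON entry describing the hosting datasource and
 * whether the journal is open access.
 */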
object SparkPrepareHostedByMapData {
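
  // One journal identifier from one source: `journal_id` holds the identifier value itself,
  // `provenance` the originating list (OPENAIRE, DOAJ, UNIBI) and `id_type` the identifier kind.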
  case class HostedByInfo(id: Option[String], officialname: String, journal_id: String, provenance: String, id_type: String)
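
  // Implicit encoders for the model types exchanged through Spark SQL Datasets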
  implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
  implicit val mapEncoderDats: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
  implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
  implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
  implicit val mapEncoderHBI: Encoder[HostedByInfo] = Encoders.product[HostedByInfo]
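
  /**
   * Merges one row of the double left join (OpenAIRE x DOAJ x Unibi Gold) into a single
   * HostedByItemType, filling only the identifier slot that matches the OpenAIRE id_type.
   */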
  def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
    val openaire: HostedByInfo = input._1._1
    val doaj: HostedByInfo = input._1._2
    val gold: HostedByInfo = input._2
    // a journal is open access if it appears in either the DOAJ or the Unibi Gold list
    val isOpenAccess: Boolean = doaj != null || gold != null

    openaire.id_type match {
      case Constants.ISSN  => HostedByItemType(openaire.id.get, openaire.officialname, openaire.journal_id, "", "", isOpenAccess)
      case Constants.EISSN => HostedByItemType(openaire.id.get, openaire.officialname, "", openaire.journal_id, "", isOpenAccess)
      case Constants.ISSNL => HostedByItemType(openaire.id.get, openaire.officialname, "", "", openaire.journal_id, isOpenAccess)
      // identifiers of an unexpected type are mapped to null and filtered out downstream
      case _ => null
    }
  }
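
  /**
   * Serializes a HostedByItemType into one JSON map entry per identifier it carries
   * (issn, eissn, lissn), each keyed by the identifier value:
   *   {"<identifier>": <JSON-serialized HostedByItemType>}
   */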
  def toHostedByMap(input: HostedByItemType): ListBuffer[String] = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val serializedJSON: String = write(input)
    val hostedBy = new ListBuffer[String]()
    if (input.issn.nonEmpty) {
      hostedBy += s"""{"${input.issn}":$serializedJSON}"""
    }
    if (input.eissn.nonEmpty) {
      hostedBy += s"""{"${input.eissn}":$serializedJSON}"""
    }
    if (input.lissn.nonEmpty) {
      hostedBy += s"""{"${input.lissn}":$serializedJSON}"""
    }
    hostedBy
  }
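
  /**
   * Reads the OpenAIRE datasource dump and explodes every datasource with journal
   * information into one HostedByInfo row per available identifier (ISSN, ISSNL, EISSN).
   */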
  def readOADataset(input: String, spark: SparkSession): Dataset[HostedByInfo] = {
    // the input is expected to be the datasource dump serialized as one JSON record per line
    val mapper = new ObjectMapper()
    spark.read
      .textFile(input)
      .map(r => mapper.readValue(r, classOf[Datasource]))
      .flatMap(ds => {
        val lst = new ListBuffer[HostedByInfo]()
        // datasources without journal information carry no ISSNs and produce no rows
        if (ds.getJournal != null) {
          val issn: String = ds.getJournal.getIssnPrinted
          val issnl: String = ds.getJournal.getIssnLinking
          val eissn: String = ds.getJournal.getIssnOnline
          val id: String = ds.getId
          val officialname: String = ds.getOfficialname.getValue
          if (issn != null) {
            lst += HostedByInfo(Some(id), officialname, issn, Constants.OPENAIRE, Constants.ISSN)
          }
          if (issnl != null) {
            lst += HostedByInfo(Some(id), officialname, issnl, Constants.OPENAIRE, Constants.ISSNL)
          }
          if (eissn != null) {
            lst += HostedByInfo(Some(id), officialname, eissn, Constants.OPENAIRE, Constants.EISSN)
          }
        }
        lst
      })
  }
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedby/prepare_hostedby_params.json"))
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    import spark.implicits._

    val datasourcePath = parser.get("datasourcePath")
    val workingDirPath = parser.get("workingPath")

    logger.info("Getting the Datasources")
    val doajDataset: Dataset[DOAJModel] = spark.read.load(workingDirPath + "/doaj").as[DOAJModel]
    // the Unibi Gold dump is assumed to be staged under the working directory, next to the DOAJ one
    val unibiDataset: Dataset[UnibiGoldModel] = spark.read.load(workingDirPath + "/unibi_gold").as[UnibiGoldModel]
    val oa: Dataset[HostedByInfo] = readOADataset(datasourcePath, spark)
    val doaj: Dataset[HostedByInfo] = doajDataset.flatMap(doaj => {
      val lst = new ListBuffer[HostedByInfo]()
      val issn: String = doaj.getIssn
      val eissn: String = doaj.getEissn
      val officialname: String = doaj.getJournalTitle
      if (issn != null) {
        lst += HostedByInfo(None, officialname, issn, Constants.DOAJ, Constants.ISSN)
      }
      if (eissn != null) {
        lst += HostedByInfo(None, officialname, eissn, Constants.DOAJ, Constants.EISSN)
      }
      lst
    })
    val gold: Dataset[HostedByInfo] = unibiDataset.flatMap(gold => {
      val lst = new ListBuffer[HostedByInfo]()
      val issn: String = gold.getIssn
      val issnl: String = gold.getIssn_l
      val officialname: String = gold.getTitle
      if (issn != null) {
        lst += HostedByInfo(None, officialname, issn, Constants.UNIBI, Constants.ISSN)
      }
      if (issnl != null) {
        lst += HostedByInfo(None, officialname, issnl, Constants.UNIBI, Constants.ISSNL)
      }
      lst
    })
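
    // Double left join on journal_id: a match in DOAJ or Unibi Gold marks the journal
    // as open access when the joined rows are merged by toHostedByItemType.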
    Aggregators
      .createHostedByItemTypes(
        oa.joinWith(doaj, oa.col("journal_id").equalTo(doaj.col("journal_id")), "left")
          .joinWith(gold, $"_1.journal_id".equalTo(gold.col("journal_id")), "left")
          .map(toHostedByItemType)
          .filter(i => i != null)
      )
      .flatMap(toHostedByMap)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/HostedByMap")
  }
}