forked from D-Net/dnet-hadoop
Hosted By Map - modification of the code that prepares the info needed to apply the HostedByMap. There is no need to join the datasources with the hbm: all the information needed is already in the hosted-by map.
This commit is contained in:
parent
1695d45bd4
commit
ff1ce75e33
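Why no datasource join is needed: each entry of the hosted-by map is keyed by a journal identifier and already carries the datasource id, official name and open-access flag, so a single map over the hosted-by file yields the EntityInfo records directly. Below is a minimal sketch of that transformation, using simplified stand-ins for the real model classes (only the fields visible in this diff are modelled, and the sample JSON line is made up):

import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse

// Simplified stand-ins for the real model classes (assumption: only the
// fields used in this commit are modelled here).
case class HostedByItemType(id: String, officialname: String, openAccess: Boolean)
case class EntityInfo(id: String, journalId: String, name: String, openAccess: Boolean)

// Mirrors the new toEntityInfo/toEntityItem pair: each hosted-by map line is a
// single-entry JSON object whose key is the journal id and whose value holds
// the datasource information.
def toEntityInfoSketch(line: String): EntityInfo = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val entry = parse(line).extract[Map[String, HostedByItemType]]
  val (journalId, item) = entry.head
  EntityInfo(item.id, journalId, item.officialname, item.openAccess)
}

// Hypothetical input line (journal id as key):
//   {"2052-8515":{"id":"10|doajarticles::1234","officialname":"Some Journal","openAccess":true}}
// would become EntityInfo("10|doajarticles::1234", "2052-8515", "Some Journal", true).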
@@ -3,7 +3,8 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DatasourceInfo, EntityInfo}
-import eu.dnetlib.dhp.schema.oaf.{Datasource, Journal, Publication}
+import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
@@ -12,11 +13,13 @@ import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import org.slf4j.{Logger, LoggerFactory}


 object SparkPrepareHostedByInfoToApply {

-  implicit val mapEncoderDSInfo: Encoder[DatasourceInfo] = Encoders.kryo[DatasourceInfo]
-  implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.kryo[EntityInfo]
+  implicit val mapEncoderDSInfo: Encoder[DatasourceInfo] = Encoders.bean(classOf[DatasourceInfo])
+  implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

   def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = {
     var lst:List[EntityInfo] = List()
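A note on the encoder change in this hunk: with Encoders.kryo the Dataset schema is a single binary column, so the column-based joins introduced further down (col("id"), col("journal_id")) could not be resolved, while Encoders.bean exposes one column per bean property. A small sketch of the difference, where EntityInfoStub is a made-up stand-in for the real bean:

import org.apache.spark.sql.{Encoder, Encoders}
import scala.beans.BeanProperty

// Made-up stand-in bean with just the properties the joins in this commit rely on.
class EntityInfoStub extends Serializable {
  @BeanProperty var id: String = _
  @BeanProperty var journal_id: String = _
}

val kryoEnc: Encoder[EntityInfoStub] = Encoders.kryo[EntityInfoStub]
println(kryoEnc.schema)   // struct<value:binary> -> col("journal_id") would fail

val beanEnc: Encoder[EntityInfoStub] = Encoders.bean(classOf[EntityInfoStub])
println(beanEnc.schema)   // struct<id:string,journal_id:string> -> joinable by column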
@ -47,25 +50,12 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def toEntityInfo(input:String): EntityInfo = {
|
||||||
def prepareDatasourceInfo(spark:SparkSession, datasourcePath:String) : Dataset[DatasourceInfo] = {
|
|
||||||
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
|
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
|
||||||
|
|
||||||
val dd : Dataset[Datasource] = spark.read.textFile(datasourcePath)
|
|
||||||
.map(r => mapper.readValue(r, classOf[Datasource]))
|
|
||||||
|
|
||||||
dd.filter(d => d.getJournal != null ).map(d => DatasourceInfo.newInstance(d.getId, d.getOfficialname.getValue,
|
|
||||||
d.getJournal.getIssnPrinted, d.getJournal.getIssnOnline, d.getJournal.getIssnLinking))
|
|
||||||
|
|
||||||
}
|
|
||||||
def toHostedByItem(input:String): HostedByItemType = {
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
|
val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
|
||||||
c.values.head
|
toEntityItem(c.keys.head, c.values.head)
|
||||||
}
|
}
|
||||||
|
|
||||||
def explodeJournalInfo(input: DatasourceInfo): List[EntityInfo] = {
|
def explodeJournalInfo(input: DatasourceInfo): List[EntityInfo] = {
|
||||||
|
@ -73,10 +63,49 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
if (input.getEissn != null && !input.getEissn.equals("")){
|
if (input.getEissn != null && !input.getEissn.equals("")){
|
||||||
lst = EntityInfo.newInstance(input.getId, input.getEissn, input.getOfficialname, input.getOpenAccess) :: lst
|
lst = EntityInfo.newInstance(input.getId, input.getEissn, input.getOfficialname, input.getOpenAccess) :: lst
|
||||||
}
|
}
|
||||||
|
if (input.getLissn != null && !input.getLissn.equals("")){
|
||||||
|
lst = EntityInfo.newInstance(input.getId, input.getLissn, input.getOfficialname, input.getOpenAccess) :: lst
|
||||||
|
}
|
||||||
|
if (input.getIssn != null && !input.getIssn.equals("")){
|
||||||
|
lst = EntityInfo.newInstance(input.getId, input.getIssn, input.getOfficialname, input.getOpenAccess) :: lst
|
||||||
|
}
|
||||||
|
|
||||||
lst
|
lst
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def joinDsandHBM(left:Dataset[DatasourceInfo], right:Dataset[HostedByItemType]): Dataset[EntityInfo] = {
|
||||||
|
left.joinWith(right,
|
||||||
|
left.col("id").equalTo(right.col("id")), "left")
|
||||||
|
.map(t2 => {
|
||||||
|
val dsi : DatasourceInfo = t2._1
|
||||||
|
if(t2._2 != null){
|
||||||
|
val hbi : HostedByItemType = t2._2
|
||||||
|
dsi.setOpenAccess(hbi.openAccess)
|
||||||
|
}
|
||||||
|
dsi
|
||||||
|
}).flatMap(explodeJournalInfo)
|
||||||
|
}
|
||||||
|
|
||||||
|
def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = {
|
||||||
|
|
||||||
|
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||||
|
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journal_id").equalTo(hbm.col("journal_id")), "left")
|
||||||
|
.map(t2 => {
|
||||||
|
val res: EntityInfo = t2._1
|
||||||
|
if(t2._2 != null ){
|
||||||
|
val ds = t2._2
|
||||||
|
res.setHb_id(ds.getId)
|
||||||
|
res.setOpenaccess(ds.getOpenaccess)
|
||||||
|
res.setName(ds.getName)
|
||||||
|
}
|
||||||
|
res
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
|
@@ -105,43 +134,20 @@

     import spark.implicits._

-    //STEP1: read the datasources and create the entries {dsid, dsofficialname, issn, eissn, lissn, openaccess}
-    val datasourceInfoDataset: Dataset[DatasourceInfo] = prepareDatasourceInfo(spark, "$graphPath/datasource")
-
-    //STEP2: read the hostedbymap and group it by datasource id
-    val hostedByDataset = Aggregators.hostedByToSingleDSId(spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(toHostedByItem)))
-
-    //STEP3: left join the datasources with the hostedby map to set whether the datasource is open access or not
-    //and explode the datasource info for every journal id that is not null
-    val join : Dataset[EntityInfo] = datasourceInfoDataset.joinWith(hostedByDataset,
-      datasourceInfoDataset.col("id").equalTo(hostedByDataset.col("id"), "left"))
-      .map(t2 => {
-        val dsi : DatasourceInfo = t2._1
-        if(t2._2 != null){
-          dsi.setOpenAccess(t2._2.openAccess)
-        }
-        dsi
-      }).flatMap(explodeJournalInfo)
-
-    //STEP4: create the exploded map of publication id, issn, eissn, lissn
+    //STEP1: read the hostedbymap and transform it into entity info
+    val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
+
+    //STEP2: create the exploded map of publication id, issn, eissn, lissn
     val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, "$graphPath/publication")

-    //STEP5: left join of the join above with resultInfo on the journal_id of the result
-    // and reduce all the results with the same id to a single entry
-    Aggregators.resultToSingleId(resultInfoDataset.joinWith(join, resultInfoDataset.col("journal_id").equalTo(join.col("journal_id")), "left")
-      .map(t2 => {
-        val res: EntityInfo = t2._1
-        if(t2._2 != null ){
-          val ds = t2._2
-          res.setHb_id(ds.getId)
-          res.setOpenaccess(ds.getOpenaccess)
-          res.setName(ds.getName)
-        }
-        res
-      })).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
+    //STEP3: left join resultInfo with hostedByInfo on the journal_id of the result
+    // and reduce all the results with the same id to a single entry, adding the datasource id
+    joinResHBM(resultInfoDataset, hostedByInfo)
+      .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)


   }

 }
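Read end to end, the prepare step after this commit reduces to three stages. The sketch below simply restates the new body of main with the diff markers stripped; the SparkSession, paths and encoders are assumed to be set up as in the surrounding job:

// STEP1: read the hosted-by map and turn each line into an EntityInfo
val hostedByInfo: Dataset[EntityInfo] =
  spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)

// STEP2: one EntityInfo per (publication id, issn/eissn/lissn) pair
val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, "$graphPath/publication")

// STEP3: left join on journal_id and collapse each publication to a single entry
joinResHBM(resultInfoDataset, hostedByInfo)
  .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)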