// dnet-hadoop/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostAction...

package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkGenerateDOIBoostActionSet {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()
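
    // Kryo encoders for the OAF model classes exchanged through Spark SQL:
    // these are plain Java beans rather than case classes, so the default
    // product encoders cannot be derived for them.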
    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
    implicit val mapEncoderAtomicAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
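
    // Input/output locations resolved from the job arguments: one path per
    // entity/relation dump produced upstream in the DOIBoost workflow, plus
    // the target of the final sequence file.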
    val dbPublicationPath = parser.get("dbPublicationPath")
    val dbDatasetPath = parser.get("dbDatasetPath")
    val crossRefRelation = parser.get("crossRefRelation")
    val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
    val dbOrganizationPath = parser.get("dbOrganizationPath")
    val workingDirPath = parser.get("targetPath")
    val sequenceFilePath = parser.get("sFilePath")
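
    // Each input is converted to its action-set representation: pairs of
    // strings produced by DoiBoostMappingUtil.toActionSet, later written out
    // as Text/Text records. Datasets additionally pass through fixResult
    // before the conversion.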
    val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
      .map(d => DoiBoostMappingUtil.fixResult(d))
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
    // .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")

    val asPublication = spark.read.load(dbPublicationPath).as[Publication]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
    // .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
    // .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    val asCRelation = spark.read.load(crossRefRelation).as[Relation]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
    // .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
    // .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
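
    // Concatenate all converted entities and relations into one dataset of
    // key/value pairs.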
    val d: Dataset[(String, String)] = asDataset
      .union(asPublication)
      .union(asOrganization)
      .union(asCRelation)
      .union(asRelAffiliation)
    // spark.read.load(s"$workingDirPath/actionSet").as[(String, String)]
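
    // Persist as a gzip-compressed Hadoop sequence file; the repartition
    // controls the number (and therefore the size) of the output files.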
    d.rdd
      .repartition(6000)
      .map(s => (new Text(s._1), new Text(s._2)))
      .saveAsHadoopFile(sequenceFilePath, classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
  }
}
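
// Illustrative invocation (a sketch only: the concrete switch names and
// abbreviations are defined in generate_doiboost_as_params.json, which is not
// shown here, and the jar name and paths below are hypothetical):
//
//   spark-submit --class eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet \
//     dhp-doiboost.jar \
//     --master yarn \
//     --dbPublicationPath /data/doiboost/publication \
//     --dbDatasetPath /data/doiboost/dataset \
//     --crossRefRelation /data/doiboost/crossrefRelation \
//     --dbaffiliationRelationPath /data/doiboost/affiliationRelation \
//     --dbOrganizationPath /data/doiboost/organization \
//     --targetPath /data/doiboost/working \
//     --sFilePath /data/doiboost/actionset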