62 lines
1.9 KiB
Scala
62 lines
1.9 KiB
Scala
package eu.dnetlib.dhp.collection.crossref
|
|
|
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Dataset => OafDataset}
|
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
|
import org.apache.spark.sql._
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
|
|
/** Spark application that maps a Crossref dump into OAF records.
  *
  * The job reads the `sourcePath`, `targetPath` and `isLookupUrl` arguments,
  * loads the vocabularies through the IS lookup service and then delegates the
  * actual conversion to [[transformCrossref]].
  *
  * @param propertyPath path forwarded to [[AbstractScalaApplication]]
  *                     (presumably the classpath location of the argument
  *                     definition file; confirm against the parent class)
  * @param args         raw command-line arguments forwarded to the parent class
  * @param log          logger used for the progress messages of this job
  */
class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log) {

  /** Entry point of the job logic: resolves the arguments, loads the
    * vocabularies and runs the Crossref transformation.
    *
    * @throws IllegalArgumentException if no vocabularies could be loaded
    */
  override def run(): Unit = {
    val sourcePath = parser.get("sourcePath")
    log.info("sourcePath: {}", sourcePath)
    val targetPath = parser.get("targetPath")
    log.info("targetPath: {}", targetPath)
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
    // Fail fast with enough context to diagnose a misconfigured lookup service,
    // instead of a bare "requirement failed".
    require(
      vocabularies != null,
      s"Unable to load vocabularies from the IS lookup service at '$isLookupUrl'"
    )

    transformCrossref(spark, sourcePath, targetPath, vocabularies)
  }

  /** Converts the Crossref dump found at `sourcePath` and stores the result
    * under `targetPath`.
    *
    * The input is read as text, every line is passed to `Crossref2Oaf.convert`
    * and the flattened results are written as gzip-compressed text,
    * partitioned by the `oafType` column. Existing output at `targetPath` is
    * overwritten.
    *
    * @param spark        session used to read the dump and write the result
    * @param sourcePath   location of the Crossref dump (one record per line)
    * @param targetPath   location where the converted records are written
    * @param vocabularies vocabularies handed to `Crossref2Oaf.convert`
    */
  def transformCrossref(
    spark: SparkSession,
    sourcePath: String,
    targetPath: String,
    vocabularies: VocabularyGroup
  ): Unit = {
    // Needed for the String encoder and for the encoder of the converted records.
    import spark.implicits._

    val dump = spark.read.text(sourcePath).as[String]
    dump
      .flatMap(s => Crossref2Oaf.convert(s, vocabularies))
      .write
      .mode(SaveMode.Overwrite)
      .partitionBy("oafType")
      .option("compression", "gzip")
      .text(targetPath)
  }
}
|
|
|
|
/** Companion object hosting the command-line entry point of the job. */
object SparkMapDumpIntoOAF {

  /** Builds the application with its argument-definition resource, the given
    * command-line arguments and a logger bound to this object, then
    * initializes and runs it.
    *
    * @param args raw command-line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val appLog: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)

    val job = new SparkMapDumpIntoOAF(
      "/eu/dnetlib/dhp/collection/crossref/convert_crossref_dump_to_oaf_params.json",
      args,
      appLog
    )
    job.initialize().run()
  }
}
|