Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
2 changed files with 5 additions and 7 deletions
Showing only changes of commit 3dc48c7ab5 - Show all commits

View File

@ -48,18 +48,16 @@ object CollectionUtils {
List() List()
} }
def saveDataset(d: Dataset[Oaf], targetPath: String):Unit = { def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val mapper = new ObjectMapper
d dataset
.flatMap(i => CollectionUtils.fixRelations(i)) .flatMap(i => CollectionUtils.fixRelations(i))
.filter(i => i != null) .filter(i => i != null)
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.text(targetPath) .json(targetPath)
} }
} }

View File

@ -38,7 +38,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion]) val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info(s"outputBasePath is '$outputBasePath'") log.info(s"outputBasePath is '$outputBasePath'")
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH" val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
log.info(s"targetPath is '$targetPath'") log.info(s"targetPath is '$targetPath'")
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark) generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
@ -54,7 +54,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
* @param outputBasePath * @param outputBasePath
*/ */
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = { def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
val total_items = spark.read.load(targetPath).count() val total_items = spark.read.text(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH) writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
} }