Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
2 changed files with 5 additions and 7 deletions
Showing only changes of commit 3dc48c7ab5 - Show all commits

View File

@ -48,18 +48,16 @@ object CollectionUtils {
List()
}
def saveDataset(d: Dataset[Oaf], targetPath: String):Unit = {
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val mapper = new ObjectMapper
d
dataset
.flatMap(i => CollectionUtils.fixRelations(i))
.filter(i => i != null)
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(targetPath)
.json(targetPath)
}
}

View File

@ -38,7 +38,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info(s"outputBasePath is '$outputBasePath'")
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
log.info(s"targetPath is '$targetPath'")
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
@ -54,7 +54,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
val total_items = spark.read.load(targetPath).count()
val total_items = spark.read.text(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
}