merged again from beta (I hope for the last time)

This commit is contained in:
Sandro La Bruzzo 2024-05-22 11:08:52 +02:00
commit f1fe363b19
10 changed files with 61 additions and 47 deletions

1
.gitignore vendored
View File

@@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf /**/.scalafmt.conf
/.java-version /.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@@ -128,12 +128,6 @@
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId> <artifactId>cnr-rmi-api</artifactId>
<exclusions>
<exclusion>
<artifactId>log4j</artifactId>
<groupId>log4j</groupId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>

View File

@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val master = parser.get("master") val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master") log.info(s"Creating Spark session: Master: $master")
SparkSession val b = SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(master) if (master != null)
.getOrCreate() b.master(master)
b.getOrCreate()
} }
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {

View File

@@ -31,31 +31,31 @@
<artifactId>dhp-actionmanager</artifactId> <artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-aggregation</artifactId> <!-- <artifactId>dhp-aggregation</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-blacklist</artifactId> <!-- <artifactId>dhp-blacklist</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-broker-events</artifactId> <!-- <artifactId>dhp-broker-events</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-dedup-openaire</artifactId> <!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-enrichment</artifactId> <!-- <artifactId>dhp-enrichment</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId> <artifactId>dhp-graph-mapper</artifactId>

View File

@@ -1,5 +1,5 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true} {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
] ]

View File

@@ -1,5 +1,6 @@
package eu.dnetlib.dhp.sx.graph package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{ import eu.dnetlib.dhp.schema.sx.scholix.{
Scholix, Scholix,
@@ -28,6 +29,7 @@ case class RelKeyValue(key: String, value: String) {}
object ScholexplorerUtils { object ScholexplorerUtils {
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
val mapper = new ObjectMapper()
case class RelationVocabulary(original: String, inverse: String) {} case class RelationVocabulary(original: String, inverse: String) {}
@@ -242,7 +244,7 @@ object ScholexplorerUtils {
s s
} }
def updateTarget(s: Scholix, t: ScholixResource): Scholix = { def updateTarget(s: Scholix, t: ScholixResource): String = {
s.setTarget(t) s.setTarget(t)
val spublishers: Seq[ScholixEntityId] = val spublishers: Seq[ScholixEntityId] =
@@ -251,6 +253,6 @@ object ScholexplorerUtils {
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List() if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
s.setPublisher(mergedPublishers.asJava) s.setPublisher(mergedPublishers.asJava)
s mapper.writeValueAsString(s)
} }
} }

View File

@@ -11,7 +11,7 @@ import eu.dnetlib.dhp.schema.oaf.{
Dataset => OafDataset Dataset => OafDataset
} }
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource} import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import org.apache.spark.sql.functions.{col, concat, expr, md5} import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
@@ -89,7 +89,13 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))")) .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
.drop("collectedfrom") .drop("collectedfrom")
.withColumnRenamed("cf", "collectedfrom") .withColumnRenamed("cf", "collectedfrom")
.distinct() .groupBy(col("id"))
.agg(
first("source").alias("source"),
first("target").alias("target"),
first("relClass").alias("relClass"),
first("collectedfrom").alias("collectedfrom")
)
bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation") bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
@@ -97,7 +103,7 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo
def generateScholix(outputPath: String, spark: SparkSession): Unit = { def generateScholix(outputPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix]) implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
import spark.implicits._ import spark.implicits._
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
@@ -106,18 +112,19 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo
val scholix_one_verse = relations val scholix_one_verse = relations
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) .map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
val resourceTarget = relations val resourceTarget = relations
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
scholix_one_verse scholix_one_verse
.joinWith(resourceTarget, scholix_one_verse("identifier") === resourceTarget("_1"), "inner") .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
.map(k => ScholexplorerUtils.updateTarget(k._1, k._2._2)) .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(s"$outputPath/scholix") .text(s"$outputPath/scholix")
} }
} }

View File

@@ -1,17 +1,26 @@
package eu.dnetlib.dhp.sx.graph.scholix package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.sql.SparkSession import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest { class ScholixGenerationTest {
@Test @Test
def generateScholix(): Unit = { def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
val app = new SparkCreateScholexplorerDump(null, null, null) val app = new SparkCreateScholexplorerDump(null, null, null)
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) // app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
// app.generateBidirectionalRelations("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) // app.generateBidirectionalRelations(
// "/home/sandro/Downloads/scholix_sample/",
// "/home/sandro/Downloads/scholix/",
// spark
// )
app.generateScholix("/home/sandro/Downloads/scholix/", spark) app.generateScholix("/home/sandro/Downloads/scholix/", spark)
} }
} }