Merge branch 'beta' into doiboost_url

2021-11-11 10:40:19 +01:00 · 2021-11-11 10:40:19 +01:00 · 148289150f
parent 6d3c4c4abe 2ca0a436ad
commit 148289150f
10 changed files with 349 additions and 1 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
@ -0,0 +1,107 @@
 package eu.dnetlib.dhp.oa.graph.resolution
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.common.HdfsSupport
 import eu.dnetlib.dhp.schema.common.EntityType
 import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
 object SparkResolveEntities {
  val mapper = new ObjectMapper()
  val entities = List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    val graphBasePath = parser.get("graphBasePath")
    log.info(s"graphBasePath  -> $graphBasePath")
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath  -> $workingPath")
    val unresolvedPath = parser.get("unresolvedPath")
    log.info(s"unresolvedPath  -> $unresolvedPath")
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    fs.mkdirs(new Path(workingPath))
    resolveEntities(spark, workingPath, unresolvedPath)
    generateResolvedEntities(spark, workingPath, graphBasePath)
    // TO BE conservative we keep the original entities in the working dir
    // and save the resolved entities on the graphBasePath
    //In future these lines of code should be removed
    entities.foreach {
      e =>
        fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
        fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
    }
 }
 def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._
    val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
    val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
    rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map {
      r =>
        val result = r._2._2
        val dnetId = r._1._1
        result.setId(dnetId)
        result
    }.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
  }
  def deserializeObject(input:String,  entity:EntityType ) :Result = {
   entity match {
     case EntityType.publication => mapper.readValue(input, classOf[Publication])
     case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
     case EntityType.software=> mapper.readValue(input, classOf[Software])
     case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct])
   }
  }
 def generateResolvedEntities(spark:SparkSession,  workingPath: String, graphBasePath:String) = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._
    val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result]
    entities.foreach {
      e =>
        spark.read.text(s"$graphBasePath/$e").as[String]
          .map(s => deserializeObject(s, e))
          .union(re)
          .groupByKey(_.getId)
          .reduceGroups {
            (x, y) =>
              x.mergeFrom(y)
              x
          }.map(_._2)
          .filter(r => r.getClass.getSimpleName.toLowerCase != "result")
          .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
          .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/resolvedGraph/$e")
    }
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
@ -128,7 +128,7 @@ object SparkResolveRelation {
    source != null
  }
-  private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
+  def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
    import spark.implicits._
    val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
@ -4,6 +4,10 @@
            <name>graphBasePath</name>
            <description>the path of the graph</description>
        </property>
        <property>
            <name>unresolvedPath</name>
            <description>the path of the unresolved Entities</description>
        </property>
    </parameters>
    <start to="ResolveRelations"/>
@ -36,5 +40,33 @@
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <action name="ResolveEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Resolve Relations in raw graph</name>
            <class>eu.dnetlib.dhp.oa.graph.resolution.SparkResolveEntities</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=10000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
            <arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
            <arg>--workingPath</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json
@ -0,0 +1,6 @@
 [
  {"paramName":"mt",  "paramLongName":"master",       "paramDescription": "should be local or yarn",    "paramRequired": true},
  {"paramName":"w",   "paramLongName":"workingPath",  "paramDescription": "the source Path",            "paramRequired": true},
  {"paramName":"u",   "paramLongName":"unresolvedPath",  "paramDescription": "the source Path",            "paramRequired": true},
  {"paramName":"g",   "paramLongName":"graphBasePath",   "paramDescription": "the path of the raw graph",  "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
@ -0,0 +1,190 @@
 package eu.dnetlib.dhp.oa.graph.resolution
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.common.EntityType
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
 import eu.dnetlib.dhp.schema.oaf.{Result, StructuredProperty}
 import org.apache.commons.io.FileUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.TestInstance.Lifecycle
 import org.junit.jupiter.api.{AfterAll, BeforeAll, Test, TestInstance}
 import java.nio.file.{Files, Path}
 import scala.collection.JavaConverters._
 import scala.io.Source
@TestInstance(Lifecycle.PER_CLASS)
 class ResolveEntitiesTest extends Serializable {
  var workingDir:Path = null
  val FAKE_TITLE = "FAKETITLE"
  val FAKE_SUBJECT = "FAKESUBJECT"
  var sparkSession:Option[SparkSession] = None
  @BeforeAll
  def setUp() :Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)
    val conf = new SparkConf()
    sparkSession = Some(SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master("local[*]").getOrCreate())
    populateDatasets(sparkSession.get)
    generateUpdates(sparkSession.get)
  }
  @AfterAll
  def tearDown():Unit = {
    FileUtils.deleteDirectory(workingDir.toFile)
    sparkSession.get.stop()
  }
  def generateUpdates(spark:SparkSession):Unit = {
    val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
    val pids:List[String] = template.lines.map{id =>
      val r = new Result
      r.setId(id.toLowerCase.trim)
      r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
      r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
      r
    }.map{r =>
      val mapper = new ObjectMapper()
      mapper.writeValueAsString(r)}.toList
    val sc =spark.sparkContext
    println(sc.parallelize(pids).count())
    spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
    val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
     mapper.readValue(s, classOf[Result])}.collect()
    assertEquals(4, ds.length)
    ds.foreach{r => assertNotNull(r.getSubject)}
    ds.foreach{r => assertEquals(1,r.getSubject.size())}
    ds.foreach{r => assertNotNull(r.getTitle)}
    ds.foreach{r => assertEquals(1,r.getTitle.size())}
    ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
    ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
    println("generated Updates")
  }
  def populateDatasets(spark:SparkSession):Unit = {
    import spark.implicits._
    val entities =SparkResolveEntities.entities
    entities.foreach{
      e =>
        val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
        spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
        println(s"Created Dataset $e")
    }
    SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
  }
  @Test
  def testResolution():Unit = {
    val spark:SparkSession = sparkSession.get
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
    val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result]
    assertEquals(3, ds.count())
    ds.collect().foreach{
      r =>
      assertTrue(r.getId.startsWith("50"))
    }
  }
  private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
    l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue))
  }
  @Test
  def testUpdate():Unit = {
    val spark:SparkSession = sparkSession.get
    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    val m = new ObjectMapper()
    SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
    SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph" )
    val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
    val t  = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
    val td  = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val softDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
    val ts  = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
    val to  = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    assertEquals(0, t)
    assertEquals(2, td)
    assertEquals(1, ts)
    assertEquals(0, to)
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates
@ -0,0 +1,4 @@
 unresolved::10.17026/dans-x3z-fsq5::doi
 unresolved::10.17026/dans-xsw-qtnx::doi
 unresolved::10.5281/zenodo.1473694::doi
 unresolved::10.17632/fake::doi