minor fix

This commit is contained in:
Sandro La Bruzzo 2023-08-02 12:12:20 +02:00
parent 74fcea66e6
commit edf5a780b8
8 changed files with 78 additions and 65 deletions

View File

@ -124,6 +124,12 @@
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId> <artifactId>cnr-rmi-api</artifactId>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -141,11 +141,7 @@ object ScholixUtils extends Serializable {
s.setRelationship(inverseRelationShip(scholix.getRelationship)) s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget) s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource) s.setTarget(scholix.getSource)
s.setIdentifier( updateId(s)
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s s
} }
@ -184,6 +180,21 @@ object ScholixUtils extends Serializable {
} else List() } else List()
} }
/** Recomputes the identifier of the given Scholix and stores it on the object.
  *
  * The identifier is produced by `generateIdentifier` from the source dnet identifier,
  * the target dnet identifier and the relationship name, read in that order.
  *
  * @param scholix the Scholix to update; it is mutated in place
  * @return the same instance that was passed in, with its identifier set
  */
def updateId(scholix: Scholix): Scholix = {
  val sourceId = scholix.getSource.getDnetIdentifier
  val targetId = scholix.getTarget.getDnetIdentifier
  val relationName = scholix.getRelationship.getName
  scholix.setIdentifier(generateIdentifier(sourceId, targetId, relationName))
  scholix
}
/** Builds an identifier by hashing the triple (source, relation, target).
  *
  * The hashed text is `sourceId::relation::targetId`; note that the relation sits
  * between the two ids even though it is the last parameter.
  *
  * @param sourceId identifier of the source side
  * @param targetId identifier of the target side
  * @param relation name of the relationship
  * @return the result of `DHPUtils.md5` applied to the joined text
  */
def generateIdentifier(sourceId: String, targetId: String, relation: String): String = {
  val payload = s"$sourceId::$relation::$targetId"
  DHPUtils.md5(payload)
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate) s.setPublicationDate(scholix.getPublicationDate)
@ -192,11 +203,7 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship) s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource) s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target)) s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier( updateId(s)
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s s
} }
@ -208,11 +215,7 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship) s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource) s.setSource(scholix.getSource)
s.setTarget(target) s.setTarget(target)
s.setIdentifier( updateId(s)
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s s
} }

View File

@ -1,11 +1,14 @@
package eu.dnetlib.dhp.actionmanager.project.utils; package eu.dnetlib.dhp.actionmanager.project.utils;
import com.fasterxml.jackson.core.type.TypeReference; import java.io.BufferedWriter;
import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects; import java.io.OutputStreamWriter;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project; import java.io.Serializable;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
@ -15,13 +18,12 @@ import org.apache.hadoop.fs.Path;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedWriter; import com.fasterxml.jackson.core.type.TypeReference;
import java.io.IOException; import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.OutputStreamWriter;
import java.io.Serializable; import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import java.nio.charset.StandardCharsets; import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import java.util.ArrayList; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.util.List;
/** /**
* @author miriam.baglioni * @author miriam.baglioni

View File

@ -26,8 +26,6 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
tmp tmp
} }
def extractAttributes(attrs: MetaData, key: String): String = { def extractAttributes(attrs: MetaData, key: String): String = {
val res = attrs.get(key) val res = attrs.get(key)
@ -68,7 +66,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
val next = reader.nextEvent() val next = reader.nextEvent()
if (next.isStartElement) { if (next.isStartElement) {
if(insideChar) { if (insideChar) {
if (sb.nonEmpty) if (sb.nonEmpty)
println(s"got data ${sb.toString.trim}") println(s"got data ${sb.toString.trim}")
insideChar = false insideChar = false
@ -100,8 +98,6 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
sb.append(d.trim) sb.append(d.trim)
} }
// next match { // next match {
// case _ if (next.isStartElement) => // case _ if (next.isStartElement) =>
// val name = next.asStartElement().getName.getLocalPart // val name = next.asStartElement().getName.getLocalPart

View File

@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.utils.PidType
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser} import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -19,7 +19,6 @@ import java.util.zip.GZIPInputStream
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension])) @ExtendWith(Array(classOf[MockitoExtension]))
class BioScholixTest extends AbstractVocabularyTest { class BioScholixTest extends AbstractVocabularyTest {
@ -50,7 +49,6 @@ class BioScholixTest extends AbstractVocabularyTest {
def testEBIData() = { def testEBIData() = {
val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml") val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
// new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz"))) // new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz"))) new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
print("DONE") print("DONE")

View File

@ -5,6 +5,6 @@
</property> </property>
<property> <property>
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark342</value>
</property> </property>
</configuration> </configuration>

View File

@ -10,7 +10,7 @@
</property> </property>
</parameters> </parameters>
<start to="ImportDatasetEntities"/> <start to="CreateScholix"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -78,9 +78,10 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.shuffle.partitions=6000
--conf spark.sql.shuffle.partitions=30000 --conf spark.dynamicAllocation.enabled=true
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.shuffle.service.enabled=true
--conf spark.dynamicAllocation.minExecutors=100
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>

View File

@ -64,7 +64,7 @@ object SparkCreateScholix {
.as[ScholixSummary] .as[ScholixSummary]
.map(s => ScholixUtils.generateScholixResourceFromSummary(s)) .map(s => ScholixUtils.generateScholixResourceFromSummary(s))
relationDS val scholixSource: Dataset[Scholix] = relationDS
.joinWith(summaryDS, relationDS("source").equalTo(summaryDS("dnetIdentifier")), "left") .joinWith(summaryDS, relationDS("source").equalTo(summaryDS("dnetIdentifier")), "left")
.map { input: (Relation, ScholixResource) => .map { input: (Relation, ScholixResource) =>
if (input._1 != null && input._2 != null) { if (input._1 != null && input._2 != null) {
@ -76,14 +76,6 @@ object SparkCreateScholix {
} else null } else null
}(scholixEncoder) }(scholixEncoder)
.filter(r => r != null) .filter(r => r != null)
.write
.option("compression", "lz4")
.mode(SaveMode.Overwrite)
.save(s"$targetPath/scholix_from_source")
val scholixSource: Dataset[Scholix] = spark.read
.load(s"$targetPath/scholix_from_source")
.as[Scholix]
scholixSource scholixSource
.joinWith(summaryDS, scholixSource("identifier").equalTo(summaryDS("dnetIdentifier")), "left") .joinWith(summaryDS, scholixSource("identifier").equalTo(summaryDS("dnetIdentifier")), "left")
@ -105,17 +97,32 @@ object SparkCreateScholix {
val scholix_o_v: Dataset[Scholix] = val scholix_o_v: Dataset[Scholix] =
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
scholix_o_v def scholix_complete(s: Scholix): Boolean = {
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))) if (s == null || s.getIdentifier == null) {
.as[Scholix] false
.map(s => (s.getIdentifier, s)) } else if (s.getSource == null || s.getTarget == null) {
.dropDuplicates("identifier") false
.write } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
.option("compression", "lz4") false
.mode(SaveMode.Overwrite) else
.save(s"$targetPath/scholix") true
}
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] val scholix_final: Dataset[Scholix] = scholix_o_v
.filter(s => scholix_complete(s))
.groupByKey(s =>
scala.Ordering.String
.min(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier)
.concat(s.getRelationship.getName)
.concat(scala.Ordering.String.max(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier))
)
.flatMapGroups((id, scholixes) => {
val s = scholixes.toList
if (s.size == 1) Seq(s(0), ScholixUtils.createInverseScholixRelation(s(0)))
else s
})
scholix_final.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
val stats: Dataset[(String, String, Long)] = scholix_final val stats: Dataset[(String, String, Long)] = scholix_final
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)) .map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))