minor fix
This commit is contained in:
parent
74fcea66e6
commit
edf5a780b8
|
@ -124,6 +124,12 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>cnr-rmi-api</artifactId>
|
<artifactId>cnr-rmi-api</artifactId>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>log4j</groupId>
|
||||||
|
<artifactId>log4j</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -141,11 +141,7 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
||||||
s.setSource(scholix.getTarget)
|
s.setSource(scholix.getTarget)
|
||||||
s.setTarget(scholix.getSource)
|
s.setTarget(scholix.getSource)
|
||||||
s.setIdentifier(
|
updateId(s)
|
||||||
DHPUtils.md5(
|
|
||||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
s
|
s
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -184,6 +180,21 @@ object ScholixUtils extends Serializable {
|
||||||
} else List()
|
} else List()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def updateId(scholix: Scholix): Scholix = {
|
||||||
|
scholix.setIdentifier(
|
||||||
|
generateIdentifier(
|
||||||
|
scholix.getSource.getDnetIdentifier,
|
||||||
|
scholix.getTarget.getDnetIdentifier,
|
||||||
|
scholix.getRelationship.getName
|
||||||
|
)
|
||||||
|
)
|
||||||
|
scholix
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateIdentifier(sourceId: String, targetId: String, relation: String): String = {
|
||||||
|
DHPUtils.md5(s"$sourceId::$relation::$targetId")
|
||||||
|
}
|
||||||
|
|
||||||
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
||||||
val s = new Scholix
|
val s = new Scholix
|
||||||
s.setPublicationDate(scholix.getPublicationDate)
|
s.setPublicationDate(scholix.getPublicationDate)
|
||||||
|
@ -192,11 +203,7 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(scholix.getRelationship)
|
s.setRelationship(scholix.getRelationship)
|
||||||
s.setSource(scholix.getSource)
|
s.setSource(scholix.getSource)
|
||||||
s.setTarget(generateScholixResourceFromSummary(target))
|
s.setTarget(generateScholixResourceFromSummary(target))
|
||||||
s.setIdentifier(
|
updateId(s)
|
||||||
DHPUtils.md5(
|
|
||||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -208,11 +215,7 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(scholix.getRelationship)
|
s.setRelationship(scholix.getRelationship)
|
||||||
s.setSource(scholix.getSource)
|
s.setSource(scholix.getSource)
|
||||||
s.setTarget(target)
|
s.setTarget(target)
|
||||||
s.setIdentifier(
|
updateId(s)
|
||||||
DHPUtils.md5(
|
|
||||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.type.TypeReference;
|
import java.io.BufferedWriter;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import java.io.IOException;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
|
import java.io.OutputStreamWriter;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
|
import java.io.Serializable;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
@ -15,13 +18,12 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.BufferedWriter;
|
import com.fasterxml.jackson.core.type.TypeReference;
|
||||||
import java.io.IOException;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import java.io.OutputStreamWriter;
|
|
||||||
import java.io.Serializable;
|
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
|
||||||
import java.nio.charset.StandardCharsets;
|
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
|
||||||
import java.util.ArrayList;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author miriam.baglioni
|
* @author miriam.baglioni
|
||||||
|
|
|
@ -26,8 +26,6 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
tmp
|
tmp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extractAttributes(attrs: MetaData, key: String): String = {
|
def extractAttributes(attrs: MetaData, key: String): String = {
|
||||||
|
|
||||||
val res = attrs.get(key)
|
val res = attrs.get(key)
|
||||||
|
@ -68,7 +66,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
val next = reader.nextEvent()
|
val next = reader.nextEvent()
|
||||||
|
|
||||||
if (next.isStartElement) {
|
if (next.isStartElement) {
|
||||||
if(insideChar) {
|
if (insideChar) {
|
||||||
if (sb.nonEmpty)
|
if (sb.nonEmpty)
|
||||||
println(s"got data ${sb.toString.trim}")
|
println(s"got data ${sb.toString.trim}")
|
||||||
insideChar = false
|
insideChar = false
|
||||||
|
@ -100,8 +98,6 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
sb.append(d.trim)
|
sb.append(d.trim)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// next match {
|
// next match {
|
||||||
// case _ if (next.isStartElement) =>
|
// case _ if (next.isStartElement) =>
|
||||||
// val name = next.asStartElement().getName.getLocalPart
|
// val name = next.asStartElement().getName.getLocalPart
|
||||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser}
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -19,7 +19,6 @@ import java.util.zip.GZIPInputStream
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
|
|
||||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||||
class BioScholixTest extends AbstractVocabularyTest {
|
class BioScholixTest extends AbstractVocabularyTest {
|
||||||
|
|
||||||
|
@ -50,7 +49,6 @@ class BioScholixTest extends AbstractVocabularyTest {
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
|
val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
|
||||||
|
|
||||||
|
|
||||||
// new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
|
// new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
|
||||||
new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
|
new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
|
||||||
print("DONE")
|
print("DONE")
|
||||||
|
|
|
@ -5,6 +5,6 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
<value>spark2</value>
|
<value>spark342</value>
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -10,7 +10,7 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ImportDatasetEntities"/>
|
<start to="CreateScholix"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
@ -78,9 +78,10 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.sql.shuffle.partitions=6000
|
||||||
--conf spark.sql.shuffle.partitions=30000
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.shuffle.service.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.minExecutors=100
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
|
|
|
@ -64,7 +64,7 @@ object SparkCreateScholix {
|
||||||
.as[ScholixSummary]
|
.as[ScholixSummary]
|
||||||
.map(s => ScholixUtils.generateScholixResourceFromSummary(s))
|
.map(s => ScholixUtils.generateScholixResourceFromSummary(s))
|
||||||
|
|
||||||
relationDS
|
val scholixSource: Dataset[Scholix] = relationDS
|
||||||
.joinWith(summaryDS, relationDS("source").equalTo(summaryDS("dnetIdentifier")), "left")
|
.joinWith(summaryDS, relationDS("source").equalTo(summaryDS("dnetIdentifier")), "left")
|
||||||
.map { input: (Relation, ScholixResource) =>
|
.map { input: (Relation, ScholixResource) =>
|
||||||
if (input._1 != null && input._2 != null) {
|
if (input._1 != null && input._2 != null) {
|
||||||
|
@ -76,14 +76,6 @@ object SparkCreateScholix {
|
||||||
} else null
|
} else null
|
||||||
}(scholixEncoder)
|
}(scholixEncoder)
|
||||||
.filter(r => r != null)
|
.filter(r => r != null)
|
||||||
.write
|
|
||||||
.option("compression", "lz4")
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.save(s"$targetPath/scholix_from_source")
|
|
||||||
|
|
||||||
val scholixSource: Dataset[Scholix] = spark.read
|
|
||||||
.load(s"$targetPath/scholix_from_source")
|
|
||||||
.as[Scholix]
|
|
||||||
|
|
||||||
scholixSource
|
scholixSource
|
||||||
.joinWith(summaryDS, scholixSource("identifier").equalTo(summaryDS("dnetIdentifier")), "left")
|
.joinWith(summaryDS, scholixSource("identifier").equalTo(summaryDS("dnetIdentifier")), "left")
|
||||||
|
@ -105,17 +97,32 @@ object SparkCreateScholix {
|
||||||
val scholix_o_v: Dataset[Scholix] =
|
val scholix_o_v: Dataset[Scholix] =
|
||||||
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
||||||
|
|
||||||
scholix_o_v
|
def scholix_complete(s: Scholix): Boolean = {
|
||||||
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
|
if (s == null || s.getIdentifier == null) {
|
||||||
.as[Scholix]
|
false
|
||||||
.map(s => (s.getIdentifier, s))
|
} else if (s.getSource == null || s.getTarget == null) {
|
||||||
.dropDuplicates("identifier")
|
false
|
||||||
.write
|
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||||
.option("compression", "lz4")
|
false
|
||||||
.mode(SaveMode.Overwrite)
|
else
|
||||||
.save(s"$targetPath/scholix")
|
true
|
||||||
|
}
|
||||||
|
|
||||||
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
|
val scholix_final: Dataset[Scholix] = scholix_o_v
|
||||||
|
.filter(s => scholix_complete(s))
|
||||||
|
.groupByKey(s =>
|
||||||
|
scala.Ordering.String
|
||||||
|
.min(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier)
|
||||||
|
.concat(s.getRelationship.getName)
|
||||||
|
.concat(scala.Ordering.String.max(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier))
|
||||||
|
)
|
||||||
|
.flatMapGroups((id, scholixes) => {
|
||||||
|
val s = scholixes.toList
|
||||||
|
if (s.size == 1) Seq(s(0), ScholixUtils.createInverseScholixRelation(s(0)))
|
||||||
|
else s
|
||||||
|
})
|
||||||
|
|
||||||
|
scholix_final.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
|
||||||
|
|
||||||
val stats: Dataset[(String, String, Long)] = scholix_final
|
val stats: Dataset[(String, String, Long)] = scholix_final
|
||||||
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
|
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
|
||||||
|
|
Loading…
Reference in New Issue