forked from D-Net/dnet-hadoop
serialise records in the OAF-store-graph mdstores in json format. Read them again in the graph construction phase using a tolerant parser to support backward compatible changes in the evolution of the schema
This commit is contained in:
parent
f0b523cfa7
commit
9458ee7938
|
@ -1,7 +1,9 @@
|
||||||
package eu.dnetlib.dhp.collection
|
package eu.dnetlib.dhp.collection
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
|
||||||
|
|
||||||
object CollectionUtils {
|
object CollectionUtils {
|
||||||
|
|
||||||
|
@ -46,4 +48,15 @@ object CollectionUtils {
|
||||||
List()
|
List()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def saveDataset(d: Dataset[Oaf], targetPath: String):Unit = {
|
||||||
|
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
|
val mapper = new ObjectMapper
|
||||||
|
|
||||||
|
d
|
||||||
|
.flatMap(i => CollectionUtils.fixRelations(i))
|
||||||
|
.filter(i => i != null)
|
||||||
|
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,14 +2,14 @@ package eu.dnetlib.dhp.datacite
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,12 +73,12 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
|
||||||
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||||
|
|
||||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
spark.read.load(sourcePath).as[DataciteType]
|
CollectionUtils.saveDataset(
|
||||||
.filter(d => d.isActive)
|
spark.read.load(sourcePath).as[DataciteType]
|
||||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
.filter(d => d.isActive)
|
||||||
.filter(d => d != null)
|
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
||||||
.flatMap(i => fixRelations(i)).filter(i => i != null)
|
.filter(d => d != null),
|
||||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
targetPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
package eu.dnetlib.dhp.sx.bio
|
package eu.dnetlib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
|
||||||
import BioDBToOAF.ScholixResolved
|
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkTransformBioDatabaseToOAF {
|
object SparkTransformBioDatabaseToOAF {
|
||||||
|
@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
database.toUpperCase() match {
|
database.toUpperCase() match {
|
||||||
case "UNIPROT" =>
|
case "UNIPROT" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath)
|
||||||
case "PDB" =>
|
case "PDB" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath)
|
||||||
case "SCHOLIX" =>
|
case "SCHOLIX" =>
|
||||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath)
|
||||||
case "CROSSREF_LINKS" =>
|
case "CROSSREF_LINKS" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
@ -177,7 +178,7 @@ object SparkCreateBaselineDataFrame {
|
||||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
|
|
||||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||||
|
@ -192,9 +193,10 @@ object SparkCreateBaselineDataFrame {
|
||||||
}
|
}
|
||||||
|
|
||||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||||
exported_dataset
|
CollectionUtils.saveDataset(exported_dataset
|
||||||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
|
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf]
|
||||||
.filter(p => p != null)
|
.filter(p => p != null),
|
||||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
targetPath)
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,10 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
import BioDBToOAF.EBILinkItem
|
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
|
@ -35,10 +34,9 @@ object SparkEBILinksToOaf {
|
||||||
|
|
||||||
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
|
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
|
||||||
|
|
||||||
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
|
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||||
.flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
|
targetPath)
|
||||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -111,7 +111,7 @@ object PubMedToOaf {
|
||||||
* @param vocabularies the vocabularies
|
* @param vocabularies the vocabularies
|
||||||
* @return The OAF instance if the mapping did not fail
|
* @return The OAF instance if the mapping did not fail
|
||||||
*/
|
*/
|
||||||
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
|
def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
|
||||||
|
|
||||||
if (article.getPublicationTypes == null)
|
if (article.getPublicationTypes == null)
|
||||||
return null
|
return null
|
||||||
|
|
|
@ -1,18 +1,15 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.raw
|
package eu.dnetlib.dhp.oa.graph.raw
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport
|
import eu.dnetlib.dhp.common.HdfsSupport
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.io.IOUtils
|
|
||||||
import org.apache.commons.lang3.StringUtils
|
|
||||||
import org.apache.http.client.methods.HttpGet
|
|
||||||
import org.apache.http.impl.client.HttpClients
|
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
@ -51,18 +48,21 @@ object CopyHdfsOafSparkApplication {
|
||||||
log.info("hdfsPath: {}", hdfsPath)
|
log.info("hdfsPath: {}", hdfsPath)
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
||||||
|
|
||||||
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
||||||
|
|
||||||
if (validPaths.nonEmpty) {
|
if (validPaths.nonEmpty) {
|
||||||
val oaf = spark.read.load(validPaths: _*).as[Oaf]
|
val oaf = spark.read.load(validPaths: _*).as[String]
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||||
val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList
|
val l = ModelSupport.oafTypes.entrySet.asScala.toList
|
||||||
l.foreach(
|
l.foreach(
|
||||||
e =>
|
e =>
|
||||||
oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e))
|
oaf
|
||||||
|
.filter(o => isOafType(o, e.getKey))
|
||||||
|
.map(j => mapper.readValue(j, e.getValue).asInstanceOf[Oaf])
|
||||||
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||||
.write
|
.write
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
@ -71,4 +71,20 @@ object CopyHdfsOafSparkApplication {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def isOafType(input: String, oafType: String): Boolean = {
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: org.json4s.JValue = parse(input)
|
||||||
|
if (oafType == "relation") {
|
||||||
|
val hasSource = (json \ "source").extractOrElse[String](null)
|
||||||
|
val hasTarget = (json \ "target").extractOrElse[String](null)
|
||||||
|
|
||||||
|
hasSource != null && hasTarget != null
|
||||||
|
} else {
|
||||||
|
val hasId = (json \ "id").extractOrElse[String](null)
|
||||||
|
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null)
|
||||||
|
hasId != null && oafType.equalsIgnoreCase(resultType)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue