update function to generate AS as sequence file
This commit is contained in:
parent
ac8747582c
commit
f9e940a2af
|
@ -678,7 +678,7 @@ object MagUtility extends Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateOrganization(r: Row): String = {
|
def generateOrganization(r: Row): (String,String) = {
|
||||||
|
|
||||||
val o = new Organization
|
val o = new Organization
|
||||||
val affId = s"20|mag_________::${DHPUtils.md5(r.getAs[Long]("AffiliationId").toString)}"
|
val affId = s"20|mag_________::${DHPUtils.md5(r.getAs[Long]("AffiliationId").toString)}"
|
||||||
|
@ -727,7 +727,7 @@ object MagUtility extends Serializable {
|
||||||
val a = new AtomicAction[Organization]()
|
val a = new AtomicAction[Organization]()
|
||||||
a.setClazz(classOf[Organization])
|
a.setClazz(classOf[Organization])
|
||||||
a.setPayload(o)
|
a.setPayload(o)
|
||||||
mapper.writeValueAsString(a)
|
(a.getClazz.getName,mapper.writeValueAsString(a))
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateAffiliationRelations(paperAffiliation: Row): List[Relation] = {
|
def generateAffiliationRelations(paperAffiliation: Row): List[Relation] = {
|
||||||
|
|
|
@ -3,6 +3,9 @@ package eu.dnetlib.dhp.collection.mag
|
||||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization
|
import eu.dnetlib.dhp.schema.oaf.Organization
|
||||||
|
import org.apache.hadoop.io.Text
|
||||||
|
import org.apache.hadoop.io.compress.{BZip2Codec, GzipCodec}
|
||||||
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
@ -24,9 +27,18 @@ class SparkMagOrganizationAS (propertyPath: String, args: Array[String], log: Lo
|
||||||
def generateAS(spark:SparkSession, magBasePath:String,outputPath:String ):Unit = {
|
def generateAS(spark:SparkSession, magBasePath:String,outputPath:String ):Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val organizations = MagUtility.loadMagEntity(spark,"Affiliations", magBasePath)
|
val organizations = MagUtility.loadMagEntity(spark,"Affiliations", magBasePath)
|
||||||
organizations.map(r => MagUtility.generateOrganization(r)).write.mode(SaveMode.Overwrite)
|
organizations
|
||||||
.option("compression", "gzip")
|
.map(r => MagUtility.generateOrganization(r))
|
||||||
.text(outputPath)
|
.rdd
|
||||||
|
.map(s => (new Text(s._1), new Text(s._2)))
|
||||||
|
.filter(s => s!=null)
|
||||||
|
.saveAsHadoopFile(
|
||||||
|
outputPath,
|
||||||
|
classOf[Text],
|
||||||
|
classOf[Text],
|
||||||
|
classOf[SequenceFileOutputFormat[Text, Text]],
|
||||||
|
classOf[BZip2Codec]
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue