forked from D-Net/dnet-hadoop
Added Action set generation for the MAG organization
This commit is contained in:
parent
41a42dde64
commit
a5ddd8dfbb
|
@ -0,0 +1,21 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "m",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "the master name",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "o",
|
||||||
|
"paramLongName": "outputPath",
|
||||||
|
"paramDescription": "The action set output path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "ma",
|
||||||
|
"paramLongName": "magBasePath",
|
||||||
|
"paramDescription": "The MAG base path",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
|
@ -1,19 +1,11 @@
|
||||||
package eu.dnetlib.dhp.collection.mag
|
package eu.dnetlib.dhp.collection.mag
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
||||||
import eu.dnetlib.dhp.schema.oaf.{
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Journal, Organization, Publication, Relation, Result, Dataset => OafDataset}
|
||||||
Author,
|
|
||||||
DataInfo,
|
|
||||||
Instance,
|
|
||||||
Journal,
|
|
||||||
Publication,
|
|
||||||
Relation,
|
|
||||||
Result,
|
|
||||||
Dataset => OafDataset
|
|
||||||
}
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.spark.sql.types._
|
import org.apache.spark.sql.types._
|
||||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||||
|
@ -121,7 +113,7 @@ object MagUtility extends Serializable {
|
||||||
"DateTime" -> DateType
|
"DateTime" -> DateType
|
||||||
)
|
)
|
||||||
|
|
||||||
val stream = Map(
|
val stream: Map[String, (String, Seq[String])] = Map(
|
||||||
"Affiliations" -> Tuple2(
|
"Affiliations" -> Tuple2(
|
||||||
"mag/Affiliations.txt",
|
"mag/Affiliations.txt",
|
||||||
Seq(
|
Seq(
|
||||||
|
@ -351,7 +343,7 @@ object MagUtility extends Serializable {
|
||||||
def getSchema(streamName: String): StructType = {
|
def getSchema(streamName: String): StructType = {
|
||||||
var schema = new StructType()
|
var schema = new StructType()
|
||||||
val d: Seq[String] = stream(streamName)._2
|
val d: Seq[String] = stream(streamName)._2
|
||||||
d.foreach { case t =>
|
d.foreach { t =>
|
||||||
val currentType = t.split(":")
|
val currentType = t.split(":")
|
||||||
val fieldName: String = currentType.head
|
val fieldName: String = currentType.head
|
||||||
var fieldType: String = currentType.last
|
var fieldType: String = currentType.last
|
||||||
|
@ -686,6 +678,58 @@ object MagUtility extends Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Converts a row of the MAG `Affiliations` table into an OAF Organization, wraps it
  * in an AtomicAction and serializes that action to JSON.
  *
  * Columns read from the row: `AffiliationId` (Long), `DisplayName`, `GridId`,
  * `Iso3166Code` and `OfficialPage` (String). `GridId`, `Iso3166Code` and
  * `OfficialPage` may be null:
  *  - without `GridId` only the `mag_id` pid is emitted;
  *  - without `Iso3166Code` the country is set to `ModelConstants.UNKNOWN_COUNTRY`;
  *  - without `OfficialPage` the website url is left unset.
  *
  * @param r a row of the MAG Affiliations table
  * @return the JSON serialization of the AtomicAction whose payload is the organization
  */
def generateOrganization(r: Row): String = {
  // Read the MAG identifier once: it feeds both the organization id and the mag_id pid.
  val magId = r.getAs[Long]("AffiliationId").toString

  // Builds a pid whose qualifier uses the pid type as both class id and class name,
  // within the dnet pid types scheme.
  def pidOf(value: String, pidType: PidType) =
    structuredProperty(
      value,
      qualifier(
        pidType.toString,
        pidType.toString,
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES
      ),
      null
    )

  val o = new Organization
  o.setId(s"20|mag_________::${DHPUtils.md5(magId)}")
  o.setDataInfo(MAGDataInfo)
  o.setCollectedfrom(List(MAGCollectedFrom).asJava)
  o.setLegalname(field(r.getAs[String]("DisplayName"), null))

  // The GRID pid (when available) comes first, the mag_id pid is always present.
  val gridPid = Option(r.getAs[String]("GridId")).map(gid => pidOf(gid, PidType.GRID))
  o.setPid((gridPid.toList :+ pidOf(magId, PidType.mag_id)).asJava)

  val country = Option(r.getAs[String]("Iso3166Code"))
    .map(c => qualifier(c, c, "dnet:countries", "dnet:countries"))
    .getOrElse(ModelConstants.UNKNOWN_COUNTRY)
  o.setCountry(country)

  Option(r.getAs[String]("OfficialPage")).foreach(ws => o.setWebsiteurl(field(ws, null)))

  val a = new AtomicAction[Organization]()
  a.setClazz(classOf[Organization])
  a.setPayload(o)
  mapper.writeValueAsString(a)
}
|
||||||
|
|
||||||
def generateAffiliationRelations(paperAffiliation: Row): List[Relation] = {
|
def generateAffiliationRelations(paperAffiliation: Row): List[Relation] = {
|
||||||
|
|
||||||
val affId = s"20|mag_________::${DHPUtils.md5(paperAffiliation.getAs[Long]("AffiliationId").toString)}"
|
val affId = s"20|mag_________::${DHPUtils.md5(paperAffiliation.getAs[Long]("AffiliationId").toString)}"
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
package eu.dnetlib.dhp.collection.mag
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Organization
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
/** Spark application that reads the MAG `Affiliations` entity and writes one serialized
  * organization action per row to the output path.
  *
  * @param propertyPath path of the parameter definition handed to the base application
  * @param args         the command line arguments
  * @param log          the logger used by the application
  */
class SparkMagOrganizationAS(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Entry point invoked once the application has been initialized: it reads the
    * `magBasePath` and `outputPath` arguments and triggers the generation.
    *
    * @throws IllegalArgumentException if `magBasePath` has not been provided
    */
  override def run(): Unit = {
    // magBasePath is not flagged as required in the parameter definition, but nothing
    // can be loaded without it: fail early with an explicit message.
    // NOTE(review): assumes parser.get yields null for a missing argument - confirm.
    val magBasePath: String = Option(parser.get("magBasePath")).getOrElse(
      throw new IllegalArgumentException("Missing parameter 'magBasePath'")
    )
    log.info(s"magBasePath is $magBasePath")
    val outputPath: String = parser.get("outputPath")
    log.info(s"outputPath is $outputPath")
    generateAS(spark, magBasePath, outputPath)
  }

  /** Loads the MAG `Affiliations` entity, maps every row to its serialized form through
    * `MagUtility.generateOrganization` and writes the result as gzip-compressed text,
    * overwriting any existing content of the output path.
    *
    * @param spark       the active Spark session
    * @param magBasePath the base path handed to `MagUtility.loadMagEntity`
    * @param outputPath  the path where the text output is written
    */
  def generateAS(spark: SparkSession, magBasePath: String, outputPath: String): Unit = {
    // Brings into scope the String encoder needed by the map below.
    import spark.implicits._
    val organizations = MagUtility.loadMagEntity(spark, "Affiliations", magBasePath)
    organizations
      .map(r => MagUtility.generateOrganization(r))
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(outputPath)
  }
}
|
||||||
|
|
||||||
|
/** Companion object hosting the logger and the command line entry point of the
  * SparkMagOrganizationAS application.
  */
object SparkMagOrganizationAS {

  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Builds the application from the given arguments, initializes it and runs it.
    *
    * @param args the command line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val propertyPath = "/eu/dnetlib/dhp/collection/mag/create_organization_AS.json"
    val application = new SparkMagOrganizationAS(propertyPath, args, log).initialize()
    application.run()
  }
}
|
|
@ -10,6 +10,7 @@ class MAGMappingTest {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
def mappingTest(): Unit = {
|
def mappingTest(): Unit = {
|
||||||
|
|
||||||
val spark = SparkSession
|
val spark = SparkSession
|
||||||
|
@ -18,10 +19,9 @@ class MAGMappingTest {
|
||||||
.master("local[*]")
|
.master("local[*]")
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
|
||||||
val s = new SparkMAGtoOAF(null, null, null)
|
val s = new SparkMagOrganizationAS(null, null, null)
|
||||||
|
|
||||||
s.convertMAG(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_oaf")
|
s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
|
||||||
s.generateAffiliations(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_oaf")
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue