enrichment steps #38
|
@ -23,7 +23,6 @@ object SparkGenerateDoiBoost {
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
|
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
|
||||||
val crossrefDatasetPath = parser.get("crossrefDatasetPath")
|
val crossrefDatasetPath = parser.get("crossrefDatasetPath")
|
||||||
val uwPublicationPath = parser.get("uwPublicationPath")
|
val uwPublicationPath = parser.get("uwPublicationPath")
|
||||||
|
@ -46,16 +45,38 @@ object SparkGenerateDoiBoost {
|
||||||
logger.info("Phase 2) Join Crossref with UnpayWall")
|
logger.info("Phase 2) Join Crossref with UnpayWall")
|
||||||
|
|
||||||
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
||||||
val uwPublication:Dataset[(String,Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p =>(p.getId, p) )
|
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
||||||
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")),"left").map(item => {
|
def applyMerge(item:((String, Publication), (String, Publication))) : Publication =
|
||||||
|
{
|
||||||
val crossrefPub = item._1._2
|
val crossrefPub = item._1._2
|
||||||
val unpayWallPub = item._1._2
|
if (item._2!= null) {
|
||||||
if(unpayWallPub!= null) {
|
val otherPub = item._2._2
|
||||||
crossrefPub.mergeFrom(unpayWallPub)
|
if (otherPub != null) {
|
||||||
|
crossrefPub.mergeFrom(otherPub)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
crossrefPub
|
crossrefPub
|
||||||
}).write.save(s"$workingDirPath/firstJoin")
|
}
|
||||||
|
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
|
||||||
|
logger.info("Phase 3) Join Result with ORCID")
|
||||||
|
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||||
|
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
|
||||||
|
|
||||||
|
logger.info("Phase 3) Join Result with MAG")
|
||||||
|
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
||||||
|
sj.where(sj("_1").like())
|
||||||
|
|
||||||
|
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="CreateDOIBoost"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
|
Loading…
Reference in New Issue