forked from D-Net/dnet-hadoop

next step of MAG conversion implemented

parent 934ad570e0
commit b771d67e9d
@@ -21,11 +21,15 @@ object DoiBoostMappingUtil {

   def generateDataInfo(): DataInfo = {
+    generateDataInfo("0.9")
+  }
+
+  def generateDataInfo(trust:String): DataInfo = {
     val di = new DataInfo
     di.setDeletedbyinference(false)
     di.setInferred(false)
     di.setInvisible(false)
-    di.setTrust("0.9")
+    di.setTrust(trust)
     di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
     di
   }
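A minimal usage sketch of the new overload (assuming the dnet-hadoop OAF DataInfo bean and this utility are on the classpath; the score value is illustrative) — Phase 7 below relies on it to carry the MAG field-of-study score as the trust value:

    import eu.dnetlib.dhp.schema.oaf.DataInfo
    import eu.dnetlib.doiboost.DoiBoostMappingUtil

    // the no-arg overload keeps the previous behaviour (trust hard-coded to "0.9")
    val defaultInfo: DataInfo = DoiBoostMappingUtil.generateDataInfo()

    // the new overload lets a caller propagate a per-record confidence as trust,
    // e.g. a MAG field-of-study score rendered as a string ("0.78" is illustrative)
    val scoredInfo: DataInfo = DoiBoostMappingUtil.generateDataInfo("0.78")
    println(scoredInfo.getTrust) // "0.78"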
@@ -140,6 +140,15 @@ case object Crossref2Oaf {
     result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)

+
+    //Mapping Subject
+    val subjectList:List[String] = (json \ "subject").extractOrElse[List[String]](List())
+
+    if (subjectList.nonEmpty) {
+      result.setSubject(subjectList.map(s=> createSP(s, "keywords", "dnet:subject_classification_typologies")).asJava)
+    }
+
+
     //Mapping AUthor
     val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
     result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava)
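For reference, a small standalone sketch (using json4s, which the mapper already uses for extractOrElse; the sample JSON is illustrative) of how the subject extraction behaves with and without a "subject" array:

    import org.json4s._
    import org.json4s.jackson.JsonMethods.parse

    implicit val formats: Formats = DefaultFormats

    // illustrative Crossref-like records
    val withSubjects    = parse("""{"subject": ["Oncology", "Cancer Research"]}""")
    val withoutSubjects = parse("""{"title": ["Some paper"]}""")

    // extractOrElse falls back to an empty list when "subject" is missing,
    // so setSubject is only invoked for records that actually carry subjects
    (withSubjects \ "subject").extractOrElse[List[String]](List())    // List(Oncology, Cancer Research)
    (withoutSubjects \ "subject").extractOrElse[List[String]](List()) // List()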
@@ -40,6 +40,9 @@ case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option

 case class MagUrl(PaperId: Long, instances: List[String])

+case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){}
+
+case class MagFieldOfStudy(PaperId:Long, subjects:List[MagSubject]) {}

 case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
@@ -135,6 +138,8 @@ case object ConversionUtil {
       j.setIssnPrinted(journal.Issn.get)
       pub.setJournal(j)
     }
+    pub.setCollectedfrom(List(createMAGCollectedFrom()).asJava)
+    pub.setDataInfo(generateDataInfo())
     pub
   }
@@ -1,20 +1,18 @@
 package eu.dnetlib.doiboost.mag

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.Publication
-import eu.dnetlib.doiboost.DoiBoostMappingUtil.asField
+import eu.dnetlib.dhp.schema.oaf.{Publication, StructuredProperty}
+import eu.dnetlib.doiboost.DoiBoostMappingUtil
+import eu.dnetlib.doiboost.DoiBoostMappingUtil.{asField, createSP}
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import org.apache.spark.sql.functions._

 import scala.collection.JavaConverters._

 object SparkPreProcessMAG {

   def main(args: Array[String]): Unit = {

     val logger: Logger = LoggerFactory.getLogger(getClass)
@@ -31,110 +29,138 @@ object SparkPreProcessMAG {
     val sourcePath = parser.get("sourcePath")
     import spark.implicits._
     implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
-    implicit val tupleForJoinEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
+    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

-    logger.info("Phase 1) make uninque DOI in Papers:")
-    val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
-
-    // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
-    val result: RDD[MagPapers] = d.where(col("Doi").isNotNull).rdd.map { p: MagPapers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: MagPapers, p2: MagPapers) =>
-      var r = if (p1 == null) p2 else p1
-      if (p1 != null && p2 != null) {
-        if (p1.CreatedDate != null && p2.CreatedDate != null) {
-          if (p1.CreatedDate.before(p2.CreatedDate))
-            r = p1
-          else
-            r = p2
-        } else {
-          r = if (p1.CreatedDate == null) p2 else p1
-        }
-      }
-      r
-    }.map(_._2)
-
-    val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
-    distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
-    logger.info(s"Total number of element: ${result.count()}")
-
-    logger.info("Phase 3) Group Author by PaperId")
-    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
-
-    val affiliation =spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
-
-    val paperAuthorAffiliation =spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
-
-    paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
-      .map{case (a:MagPaperAuthorAffiliation,b:MagAuthor )=> (a.AffiliationId,MagPaperAuthorDenormalized(a.PaperId, b, null)) }
-      .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
-      .map(s => {
-        val mpa = s._1._2
-        val af = s._2
-        if (af!= null) {
-          MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName)
-        } else
-          mpa
-      }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
-      .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
-
-    logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
-
-    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
-
-    val papers =spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
-
-    val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
-
-    val firstJoin =papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")),"left")
-    firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
-      .map { a: ((MagPapers, MagJournal), MagPaperWithAuthorList) => ConversionUtil.createOAFFromJournalAuthorPaper(a) }.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
-
-    var magPubs:Dataset[(String,Publication)] = spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)]
-
-    val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
-
-    logger.info("Phase 5) enrich publication with URL and Instances")
-
-    magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
-      .map{a:((String,Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2))}
-      .write.mode(SaveMode.Overwrite)
-      .save(s"${parser.get("targetPath")}/merge_step_3")
-
-    logger.info("Phase 6) Enrich Publication with description")
-    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
-    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
-
-    val paperAbstract =spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
-
-    magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)]
-
-    magPubs.joinWith(paperAbstract,col("_1").equalTo(paperAbstract("PaperId")), "left").map(p=>
-      {
-        val pub = p._1._2
-        val abst = p._2
-        if (abst!= null) {
-          pub.setDescription(List(asField(abst.IndexedAbstract)).asJava)
-        }
-        pub
-      }
-    ).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
-
+    // logger.info("Phase 1) make uninque DOI in Papers:")
+    // val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
+    //
+    //
+    // // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
+    // val result: RDD[MagPapers] = d.where(col("Doi").isNotNull).rdd.map { p: MagPapers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: MagPapers, p2: MagPapers) =>
+    // var r = if (p1 == null) p2 else p1
+    // if (p1 != null && p2 != null) {
+    // if (p1.CreatedDate != null && p2.CreatedDate != null) {
+    // if (p1.CreatedDate.before(p2.CreatedDate))
+    // r = p1
+    // else
+    // r = p2
+    // } else {
+    // r = if (p1.CreatedDate == null) p2 else p1
+    // }
+    // }
+    // r
+    // }.map(_._2)
+    //
+    // val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
+    // distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
+    // logger.info(s"Total number of element: ${result.count()}")
+    //
+    // logger.info("Phase 3) Group Author by PaperId")
+    // val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
+    //
+    // val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
+    //
+    // val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
+    //
+    //
+    // paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
+    // .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null)) }
+    // .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
+    // .map(s => {
+    // val mpa = s._1._2
+    // val af = s._2
+    // if (af != null) {
+    // MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName)
+    // } else
+    // mpa
+    // }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
+    // .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
+    //
+    // logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
+    //
+    // val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
+    //
+    // val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
+    //
+    // val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
+    //
+    // val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
+    // firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
+    // .map { a: ((MagPapers, MagJournal), MagPaperWithAuthorList) => ConversionUtil.createOAFFromJournalAuthorPaper(a) }.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
+    //
+    //
+    // var magPubs: Dataset[(String, Publication)] = spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
+    //
+    // val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
+    //
+    //
+    // logger.info("Phase 5) enrich publication with URL and Instances")
+    //
+    // magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
+    // .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
+    // .write.mode(SaveMode.Overwrite)
+    // .save(s"${parser.get("targetPath")}/merge_step_3")
+    //
+    //
+    // logger.info("Phase 6) Enrich Publication with description")
+    // val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
+    // pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
+    //
+    // val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
+    //
+    //
+    // magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
+    //
+    // magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left").map(p => {
+    // val pub = p._1._2
+    // val abst = p._2
+    // if (abst != null) {
+    // pub.setDescription(List(asField(abst.IndexedAbstract)).asJava)
+    // }
+    // pub
+    // }
+    // ).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
+    //
+
+    logger.info("Phase 7) Enrich Publication with FieldOfStudy")
+
+    val magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
+
+    val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
+
+    val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
+
+    val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
+      .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
+      .groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
+      .as[MagFieldOfStudy]
+
+    magPubs.joinWith(paperField, col("_1").equalTo(paperField("PaperId")), "left").
+      map(item => {
+        val publication = item._1._2
+        val fieldOfStudy = item._2
+        if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) {
+          val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
+            val s1 = createSP(s.DisplayName, "keywords", "dnet:subject_classification_typologies")
+            val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
+            var resList: List[StructuredProperty] = List(s1)
+            if (s.MainType.isDefined) {
+              val maintp = s.MainType.get
+              val s2 = createSP(s.MainType.get, "keywords", "dnet:subject_classification_typologies")
+              s2.setDataInfo(di)
+              resList = resList ::: List(s2)
+              if (maintp.contains(".")) {
+                val s3 = createSP(maintp.split("\\.").head, "keywords", "dnet:subject_classification_typologies")
+                s3.setDataInfo(di)
+                resList = resList ::: List(s3)
+              }
+            }
+            resList
+          })
+          publication.setSubject(p.asJava)
+        }
+        publication
+      }).map{s:Publication => s}(Encoders.bean(classOf[Publication])).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication")

   }
 }
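To make the subject fan-out in the new Phase 7 easier to follow, here is a small standalone sketch of the same MainType handling on plain strings (the case class and sample values are illustrative; the actual code wraps each label in a StructuredProperty via createSP and attaches the field-of-study score as trust through generateDataInfo):

    // illustrative input: a MAG field of study with an optional dotted MainType
    case class Fos(displayName: String, mainType: Option[String], score: Float)

    def subjectLabels(s: Fos): List[String] = {
      var labels = List(s.displayName)                        // always keep the display name
      if (s.mainType.isDefined) {
        val maintp = s.mainType.get
        labels = labels ::: List(maintp)                      // add the full main type
        if (maintp.contains("."))
          labels = labels ::: List(maintp.split("\\.").head)  // and its root segment
      }
      labels
    }

    // e.g. Fos("Team sport", Some("sports.team"), 0.78f)
    //   -> List("Team sport", "sports.team", "sports")
    println(subjectLabels(Fos("Team sport", Some("sports.team"), 0.78f)))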
@@ -22,7 +22,7 @@
         </property>
     </parameters>

-    <start to="ResetWorkingPath"/>
+    <start to="PreprocessMag"/>


    <kill name="Kill">
@@ -24,47 +24,21 @@ class MAGMappingTest {

   val mapper = new ObjectMapper()


   @Test
-  def testMAGCSV(): Unit = {
-    // SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
-
-    val sparkConf: SparkConf = new SparkConf
-
-    val spark: SparkSession = SparkSession.builder()
-      .config(sparkConf)
-      .appName(getClass.getSimpleName)
-      .master("local[*]")
-      .getOrCreate()
-
-    import spark.implicits._
-
-    implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
-    implicit val longBarEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
-
-    val sourcePath = "/data/doiboost/mag/input"
-
-    mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
-
-    val magOAF = spark.read.load("$sourcePath/merge_step_4").as[Publication]
-
-    println(magOAF.first().getOriginalId)
-
-    magOAF.map(k => (ConversionUtil.extractMagIdentifier(k.getOriginalId.asScala),k)).as[(String,Publication)].show()
-
-    println((ConversionUtil.extractMagIdentifier(magOAF.first().getOriginalId.asScala)))
-
-    val magIDRegex: Regex = "^[0-9]+$".r
-
-    println(magIDRegex.findFirstMatchIn("suca").isDefined)
-
+  def testSplitter():Unit = {
+    val s = "sports.team"
+
+    if (s.contains(".")) {
+      println(s.split("\\.")head)
+    }
   }


   @Test
   def buildInvertedIndexTest(): Unit = {
     val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString