dnet-hadoop/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala


package eu.dnetlib.dhp.doiboost.mag
import eu.dnetlib.doiboost.mag.{ConversionUtil, MagPapers, SparkProcessMAG}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, SparkSession}
import org.codehaus.jackson.map.ObjectMapper
import org.json4s.DefaultFormats
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp
import scala.io.Source
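
/**
 * Tests for the DOIBoost MAG (Microsoft Academic Graph) mapping helpers:
 * decoding abstracts stored as inverted indexes and normalizing/deduplicating
 * papers by DOI via SparkProcessMAG.getDistinctResults.
 */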
class MAGMappingTest {

  val logger: Logger = LoggerFactory.getLogger(getClass)
  val mapper = new ObjectMapper()

  @Test
  def testSplitter(): Unit = {
    val s = "sports.team"
    if (s.contains(".")) {
      println(s.split("\\.").head)
    }
  }
  @Test
  def testDate(): Unit = {
    val p: Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
    println(p.toString.substring(0, 10))
  }
  @Test
  def buildInvertedIndexTest(): Unit = {
    val json_input = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
      .mkString
    val description = ConversionUtil.convertInvertedIndexString(json_input)
    assertNotNull(description)
    assertTrue(description.nonEmpty)
    logger.debug(description)
  }
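
  /*
   * For reference: MAG ships paper abstracts as a JSON inverted index, i.e. an
   * "IndexLength" plus a map from each token to the positions where it occurs,
   * and convertInvertedIndexString rebuilds the plain-text abstract from it.
   * The helper below is a minimal sketch of that reconstruction (not the
   * ConversionUtil implementation), assuming the usual MAG shape
   * {"IndexLength": n, "InvertedIndex": {token: [positions]}} has already been
   * parsed into a Scala map.
   */
  private def rebuildAbstractSketch(indexLength: Int, invertedIndex: Map[String, Seq[Int]]): String = {
    val slots = new Array[String](indexLength)
    for ((token, positions) <- invertedIndex; pos <- positions if pos >= 0 && pos < indexLength)
      slots(pos) = token // place each token at every position it occupies
    slots.filter(_ != null).mkString(" ")
  }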
  @Test
  def normalizeDoiTest(): Unit = {
    implicit val formats = DefaultFormats
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()
    val path = getClass.getResource("/eu/dnetlib/doiboost/mag/magPapers.json").getPath

    import org.apache.spark.sql.Encoders
    val schema = Encoders.product[MagPapers].schema
    import spark.implicits._

    val magPapers: Dataset[MagPapers] =
      spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
    val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
    assertTrue(ret.count == 10)
    ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
    spark.close()
  }
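
  /*
   * A minimal sketch of the behaviour the two normalizeDoiTest cases assert on:
   * lower-case the DOI, then keep a single row per normalized DOI. This is an
   * illustration written for the tests, not the actual body of
   * SparkProcessMAG.getDistinctResults; the hypothetical helper assumes
   * MagPapers is a case class with a Doi field, as the assertions above imply.
   */
  private def distinctByDoiSketch(papers: Dataset[MagPapers], spark: SparkSession): Dataset[MagPapers] = {
    import spark.implicits._
    papers
      .map(p => p.copy(Doi = p.Doi.toLowerCase)) // normalize DOI case
      .groupByKey(_.Doi)                         // group duplicates sharing a DOI
      .reduceGroups((a, b) => a)                 // keep one representative per group
      .map(_._2)
  }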
  @Test
  def normalizeDoiTest2(): Unit = {
    implicit val formats = DefaultFormats
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()
    val path = getClass.getResource("/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json").getPath

    import org.apache.spark.sql.Encoders
    val schema = Encoders.product[MagPapers].schema
    import spark.implicits._

    val magPapers: Dataset[MagPapers] =
      spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
    val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
    assertTrue(ret.count == 8)
    ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
    spark.close()
    // ret.take(8).foreach(mp => println(write(mp)))
  }
}