2020-05-11 09:38:27 +02:00
|
|
|
package eu.dnetlib.doiboost.mag
|
|
|
|
|
2020-06-04 14:39:20 +02:00
|
|
|
import java.sql.Timestamp
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
|
|
|
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
|
2021-06-30 12:57:11 +02:00
|
|
|
import org.apache.spark.{SparkConf, SparkContext}
|
2020-05-19 09:24:45 +02:00
|
|
|
import org.apache.spark.api.java.function.MapFunction
|
|
|
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|
|
|
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
2020-05-11 09:38:27 +02:00
|
|
|
import org.junit.jupiter.api.Test
|
|
|
|
import org.slf4j.{Logger, LoggerFactory}
|
2020-05-13 10:38:04 +02:00
|
|
|
import org.junit.jupiter.api.Assertions._
|
2020-05-19 09:24:45 +02:00
|
|
|
import org.apache.spark.sql.functions._
|
|
|
|
|
|
|
|
import scala.collection.JavaConverters._
|
2020-05-13 10:38:04 +02:00
|
|
|
import scala.io.Source
|
2020-05-19 09:24:45 +02:00
|
|
|
import scala.reflect.ClassTag
|
|
|
|
import scala.util.matching.Regex
|
|
|
|
|
2020-05-11 09:38:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
class MAGMappingTest {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  // Kept for parity with the other doiboost mapping tests; handy for ad-hoc
  // serialization of OAF entities while debugging.
  val mapper = new ObjectMapper()

  /** Creates a local 2-core SparkSession for the DOI-normalization tests. */
  private def localSpark(): SparkSession = {
    val conf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(conf)
    SparkSession.builder.config(sc.getConf).getOrCreate()
  }

  /** Loads a multiline-JSON resource as a Dataset[MagPapers] and applies
    * SparkProcessMAG.getDistinctResults (DOI lowercasing + deduplication).
    *
    * @param spark    the session to read with
    * @param resource classpath resource name next to this test class
    */
  private def distinctPapersFrom(spark: SparkSession, resource: String): Dataset[MagPapers] = {
    import spark.implicits._
    val schema = Encoders.product[MagPapers].schema
    val path = getClass.getResource(resource).getPath
    val magPapers: Dataset[MagPapers] =
      spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
    SparkProcessMAG.getDistinctResults(magPapers)
  }

  /** The MAG field-of-study code before the first dot is its root category. */
  @Test
  def testSplitter(): Unit = {
    val s = "sports.team"
    if (s.contains(".")) {
      // Previously this only printed the value; assert it so the test can fail.
      assertEquals("sports", s.split("\\.").head)
    }
  }

  /** The first 10 chars of Timestamp.toString are the ISO date (yyyy-MM-dd). */
  @Test
  def testDate(): Unit = {
    val p: Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
    // Previously this only printed the value; assert it so the test can fail.
    assertEquals("2011-10-02", p.toString.substring(0, 10))
  }

  /** Converting a MAG inverted-index JSON must yield a non-empty abstract. */
  @Test
  def buildInvertedIndexTest(): Unit = {
    val json_input =
      Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
    val description = ConversionUtil.convertInvertedIndexString(json_input)
    assertNotNull(description)
    assertTrue(description.nonEmpty)
    logger.debug(description)
  }

  /** 10 distinct papers stay 10, and every DOI comes out lowercased. */
  @Test
  def normalizeDoiTest(): Unit = {
    val spark = localSpark()
    try {
      val ret: Dataset[MagPapers] = distinctPapersFrom(spark, "magPapers.json")
      assertEquals(10L, ret.count)
      ret.take(10).foreach(mp => assertEquals(mp.Doi.toLowerCase, mp.Doi))
    } finally {
      // finally-close so a failed assertion cannot leak the local SparkContext
      // and poison the other Spark test in this JVM.
      spark.close()
    }
  }

  /** DOI duplicates (case-insensitive) collapse: 10 input rows -> 8 distinct. */
  @Test
  def normalizeDoiTest2(): Unit = {
    val spark = localSpark()
    try {
      val ret: Dataset[MagPapers] = distinctPapersFrom(spark, "duplicatedMagPapers.json")
      assertEquals(8L, ret.count)
      ret.take(8).foreach(mp => assertEquals(mp.Doi.toLowerCase, mp.Doi))
    } finally {
      spark.close()
    }
  }

}
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
|