2020-05-11 09:38:27 +02:00
|
|
|
package eu.dnetlib.doiboost.mag
|
|
|
|
|
2020-05-19 09:24:45 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
|
|
|
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
|
|
|
|
import org.apache.spark.SparkConf
|
|
|
|
import org.apache.spark.api.java.function.MapFunction
|
|
|
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|
|
|
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
2020-05-11 09:38:27 +02:00
|
|
|
import org.junit.jupiter.api.Test
|
|
|
|
import org.slf4j.{Logger, LoggerFactory}
|
2020-05-13 10:38:04 +02:00
|
|
|
import org.junit.jupiter.api.Assertions._
|
2020-05-19 09:24:45 +02:00
|
|
|
import org.apache.spark.sql.functions._
|
|
|
|
|
|
|
|
import scala.collection.JavaConverters._
|
2020-05-13 10:38:04 +02:00
|
|
|
import scala.io.Source
|
2020-05-19 09:24:45 +02:00
|
|
|
import scala.reflect.ClassTag
|
|
|
|
import scala.util.matching.Regex
|
|
|
|
|
2020-05-11 09:38:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
/** Tests around the MAG (Microsoft Academic Graph) mapping utilities in [[ConversionUtil]].
  *
  * NOTE(review): `testMAGCSV` reads from a hard-coded absolute path on the local file system
  * and will fail on any machine where that dataset is not present.
  */
class MAGMappingTest {

  val logger: Logger = LoggerFactory.getLogger(getClass)
  val mapper = new ObjectMapper()

  /** Loads the `merge_step_4` dataset from the local MAG input folder as a
    * `Dataset[Publication]` and prints, for inspection, the original ids of the first
    * record and the value returned by `ConversionUtil.extractMagIdentifier` for them.
    *
    * This test makes no assertions: it only fails if loading or mapping throws.
    */
  @Test
  def testMAGCSV(): Unit = {
    // SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))

    val sparkConf: SparkConf = new SparkConf

    val spark: SparkSession = SparkSession.builder()
      .config(sparkConf)
      .appName(getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // Publication is a plain Java bean, so it is serialized through Kryo.
    implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
    // Encoder required by the map(...) below, which yields (String, Publication) pairs.
    implicit val longBarEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

    val sourcePath = "/data/doiboost/mag/input"

    mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)

    // The `s` interpolator is required here: without it Spark would be asked to load
    // the literal path "$sourcePath/merge_step_4" instead of the real input folder.
    val magOAF = spark.read.load(s"$sourcePath/merge_step_4").as[Publication]

    println(magOAF.first().getOriginalId)

    magOAF.map(k => (ConversionUtil.extractMagIdentifier(k.getOriginalId.asScala), k)).as[(String, Publication)].show()

    println((ConversionUtil.extractMagIdentifier(magOAF.first().getOriginalId.asScala)))

    // Matches strings made only of digits; printed below against a non-numeric sample.
    val magIDRegex: Regex = "^[0-9]+$".r

    println(magIDRegex.findFirstMatchIn("suca").isDefined)
  }

  /** Feeds the `invertedIndex.json` test resource to
    * `ConversionUtil.convertInvertedIndexString` and checks that the result is
    * neither null nor empty.
    */
  @Test
  def buildInvertedIndexTest(): Unit = {
    val stream = getClass.getResourceAsStream("invertedIndex.json")
    // getResourceAsStream returns null for a missing resource: fail with a clear message
    // instead of an opaque NullPointerException while reading.
    assertNotNull(stream, "test resource invertedIndex.json not found on the classpath")

    val source = Source.fromInputStream(stream)
    // Always release the underlying stream, even if reading fails.
    val json_input = try source.mkString finally source.close()

    val description = ConversionUtil.convertInvertedIndexString(json_input)
    assertNotNull(description)
    assertTrue(description.nonEmpty)

    logger.debug(description)
  }

}
|
2020-05-19 09:24:45 +02:00
|
|
|
|
|
|
|
|