dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala

package eu.dnetlib.dhp.oa.graph.hostedbymap
import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization.write
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
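/** Tests for the hosted-by map preprocessing in [[SparkProduceHostedByMap]]: reading the OpenAIRE
  * datasource dump, the UNIBI gold open-access journal list and the DOAJ dump into
  * [[HostedByItemType]] datasets, and exploding/aggregating them by ISSN via [[Aggregators]].
  */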
class TestPreprocess extends java.io.Serializable {
  implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
  implicit val schema: Encoder[HostedByInfo] = Encoders.product[HostedByInfo]
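  /** Serializes a [[HostedByItemType]] to JSON (json4s), used by the tests to print dataset contents. */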
  def toHBIString(hbi: HostedByItemType): String = {
    implicit val formats: DefaultFormats.type = DefaultFormats

    write(hbi)
  }
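  /** Reads the datasource.json fixture through [[SparkProduceHostedByMap.oaHostedByDataset]]:
    * each of the 9 records keeps at least one of issn/eissn/lissn and an OpenAIRE datasource
    * id (prefix "10|").
    */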
  @Test
  def readDatasource(): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()
    val path = getClass.getResource("datasource.json").getPath

    val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.oaHostedByDataset(spark, path)

    assertEquals(9, ds.count)

    assertEquals(8, ds.filter(hbi => !hbi.issn.equals("")).count)
    assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )

    assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
    assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
    )
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
    )
    assertTrue(
      ds.filter(hbi =>
        hbi.issn.equals("0212-8365") && hbi.id
          .equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
      ).count == 1
    )

    ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
    ds.foreach(hbi => println(toHBIString(hbi)))

    spark.close()
  }
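  /** Reads the unibi_transformed.json fixture through
    * [[SparkProduceHostedByMap.goldHostedByDataset]]: all 29 entries carry issn and lissn,
    * never eissn, and are tagged with the [[Constants.UNIBI]] id.
    */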
  @Test
  def readGold(): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()
    val path = getClass.getResource("unibi_transformed.json").getPath

    val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)

    assertEquals(29, ds.count)
    assertEquals(29, ds.filter(hbi => !hbi.issn.equals("")).count)
    assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("2239-6101"))
        .first()
        .officialname
        .equals("European journal of sustainable development.")
    )
    assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
    assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
    ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
    ds.foreach(hbi => println(toHBIString(hbi)))

    spark.close()
  }
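  /** Reads the doaj_transformed.json fixture through
    * [[SparkProduceHostedByMap.doajHostedByDataset]]: entries carry issn, eissn or both,
    * never lissn, and are tagged with the [[Constants.DOAJ]] id.
    */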
  @Test
  def readDoaj(): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()
    val path = getClass.getResource("doaj_transformed.json").getPath

    val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.doajHostedByDataset(spark, path)

    assertEquals(25, ds.count)
    assertEquals(14, ds.filter(hbi => !hbi.issn.equals("")).count)
    assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("2077-3099"))
        .first()
        .officialname
        .equals("Journal of Space Technology")
    )
    assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
    assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
    assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
    ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.DOAJ)))
    ds.foreach(hbi => println(toHBIString(hbi)))

    spark.close()
  }
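  /** Unions the three sources and explodes each item into (issn, item) pairs via
    * [[SparkProduceHostedByMap.toList]]; [[Aggregators.explodeHostedByItemType]] then reduces
    * the 106 pairs to the 82 distinct ISSN keys, e.g. "2077-3757" ends up as a single
    * openAccess entry pointing to an OpenAIRE datasource ("10|" id).
    */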
  @Test
  def testAggregator(): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.set("spark.driver.host", "localhost")
    val spark: SparkSession =
      SparkSession
        .builder()
        .appName(getClass.getSimpleName)
        .config(conf)
        .getOrCreate()

    val tmp = SparkProduceHostedByMap
      .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
      .union(
        SparkProduceHostedByMap
          .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
      )
      .union(
        SparkProduceHostedByMap
          .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
      )
      .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
        Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
      )

    assertEquals(106, tmp.count)
    assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)

    val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(tmp)
    assertEquals(82, ds.count)

    assertEquals(13, ds.filter(i => i._2.id.startsWith("10|")).count)
    assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.id.startsWith("10|"))
    assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
    assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)

    val hbmap: Dataset[String] = ds
      .filter(hbi => hbi._2.id.startsWith("10|"))
      .map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)

    hbmap.foreach(entry => println(entry))

    spark.close()
  }
}