package eu.dnetlib.dhp.doiboost

import eu.dnetlib.dhp.schema.oaf.{Publication, Relation}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import scala.collection.JavaConverters._

class QueryTest {

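  /**
   * Parses the input string as JSON with json4s and returns the value of
   * its top-level "payload" field, re-serialized as a compact JSON string.
   */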
  def extract_payload(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)

    compact(render(json \ "payload"))
  }
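
  /**
   * True when the relation was collected from at least one datasource whose
   * name contains "Australian" (the method name suggests ANDS, the
   * Australian National Data Service).
   */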
  def has_ands(r: Relation): Boolean = {
    r.getCollectedfrom != null && r.getCollectedfrom.asScala.exists(k => k.getValue != null && k.getValue.contains("Australian"))
  }
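
  /**
   * Despite the name, this is the negation of "every instance has a URL":
   * it returns true when the publication has no instances at all, or when
   * at least one instance is missing its URL list.
   */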
  def hasInstanceWithUrl(p: Publication): Boolean = {
    // number of instances that expose a non-empty URL list
    val c = p.getInstance.asScala.count(i => i.getUrl != null && !i.getUrl.isEmpty)
    !(!p.getInstance.isEmpty && c == p.getInstance.size)
  }
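
  /**
   * Despite the name, this returns true when the publication has at least
   * one instance and every instance carries a non-null access right with a
   * populated classname.
   */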
  def hasNullAccessRights(p: Publication): Boolean = {
    // number of instances with a non-null access right and a populated classname
    val c = p.getInstance.asScala.count(i => i.getAccessright != null && i.getAccessright.getClassname != null && i.getAccessright.getClassname.nonEmpty)
    !p.getInstance.isEmpty && c == p.getInstance.size
  }
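
  /**
   * Loads a Kryo-encoded Dataset[Publication] from "/tmp/p" and counts the
   * publications whose best access right carries a populated classname.
   */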
  def myQuery(spark: SparkSession, sc: SparkContext): Unit = {
    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]

    // pretty-print any record serialized with this mapper
    val mapper = new ObjectMapper()
    mapper.configure(SerializationConfig.Feature.INDENT_OUTPUT, true)

    val ds: Dataset[Publication] = spark.read.load("/tmp/p").as[Publication]

    val withBestAccessRight = ds
      .filter(p => p.getBestaccessright != null && p.getBestaccessright.getClassname != null && p.getBestaccessright.getClassname.nonEmpty)
      .count()
    println(s"publications with a populated best access right: $withBestAccessRight")
  }
}
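
/*
 * A minimal sketch of how this class might be exercised, assuming a local
 * Spark installation; the QueryTestApp object, the app name and the
 * local[*] master are illustrative, not part of the original code. The
 * input path "/tmp/p" is the one hard-coded in myQuery.
 */
object QueryTestApp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("QueryTest")
      .master("local[*]") // assumption: run locally for testing
      .getOrCreate()

    new QueryTest().myQuery(spark, spark.sparkContext)
    spark.stop()
  }
}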