forked from D-Net/dnet-hadoop
update crossref mapping to be transformed together with UnpayWall
This commit is contained in:
parent
58dbe71d39
commit
ece56f0178
|
@ -11,6 +11,12 @@
|
||||||
"paramDescription": "The base path of Crossref DUMP",
|
"paramDescription": "The base path of Crossref DUMP",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "uw",
|
||||||
|
"paramLongName": "unpaywallPath",
|
||||||
|
"paramDescription": "The base path of unpaywall DUMP",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "t",
|
"paramName": "t",
|
||||||
"paramLongName": "targetPath",
|
"paramLongName": "targetPath",
|
||||||
|
|
|
@ -2,11 +2,15 @@
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>sourcePath</name>
|
<name>sourcePath</name>
|
||||||
<description>The base path of MAG DUMP CSV Tables</description>
|
<description>The base path of Crossref DUMP </description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>targetPath</name>
|
<name>targetPath</name>
|
||||||
<description>The base path of MAG DUMP CSV Tables</description>
|
<description>The targetPath</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>unpaywallPath</name>
|
||||||
|
<description>The base path of unpaywall DUMP </description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>isLookupUrl</name>
|
<name>isLookupUrl</name>
|
||||||
|
@ -42,6 +46,7 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--unpaywallPath</arg><arg>${unpaywallPath}</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
|
|
|
@ -46,6 +46,10 @@ case class mappingFunder(name: String, DOI: Option[String], award: Option[List[S
|
||||||
|
|
||||||
case class CrossrefResult(oafType: String, body: String) {}
|
case class CrossrefResult(oafType: String, body: String) {}
|
||||||
|
|
||||||
|
/** One Unpaywall dump record, limited to the fields the Crossref mapping reads.
  *
  * @param doi              DOI of the record
  * @param is_oa            Unpaywall open-access flag
  * @param best_oa_location best open-access location; may be null, so callers check it before use
  * @param oa_status        Unpaywall OA status string, translated by `get_unpaywall_color`
  */
case class UnpayWall(
  doi: String,
  is_oa: Boolean,
  best_oa_location: UnpayWallOALocation,
  oa_status: String
)
|
||||||
|
|
||||||
|
/** Open-access location attached to an Unpaywall record.
  *
  * @param license   licence of the location, when present
  * @param url       URL of the location
  * @param host_type host type of the location, when present
  */
case class UnpayWallOALocation(
  license: Option[String],
  url: String,
  host_type: Option[String]
)
|
||||||
|
|
||||||
case object Crossref2Oaf {
|
case object Crossref2Oaf {
|
||||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||||
val mapper = new ObjectMapper
|
val mapper = new ObjectMapper
|
||||||
|
@ -87,6 +91,15 @@ case object Crossref2Oaf {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Builds the `collectedfrom` entry that identifies UnpayWall as the providing datasource.
  *
  * @return a fresh KeyValue whose value is "UnpayWall" and whose key is the
  *         `10|openaire____:` prefix followed by `DHPUtils.md5` of the lower-cased name
  */
def createUnpayWallCollectedFrom(): KeyValue = {
  val datasourceName = "UnpayWall"
  val collectedFrom = new KeyValue
  collectedFrom.setKey(s"10|openaire____:${DHPUtils.md5(datasourceName.toLowerCase)}")
  collectedFrom.setValue(datasourceName)
  collectedFrom
}
|
||||||
|
|
||||||
def generateDataInfo(): DataInfo = {
|
def generateDataInfo(): DataInfo = {
|
||||||
generateDataInfo("0.91")
|
generateDataInfo("0.91")
|
||||||
}
|
}
|
||||||
|
@ -289,6 +302,34 @@ case object Crossref2Oaf {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Translates an Unpaywall `oa_status` value into an [[OpenAccessRoute]].
  *
  * The comparison is case-insensitive.
  *
  * @param input the `oa_status` string of an Unpaywall record; may be null
  * @return `None` when the status is null or closed; otherwise the matching route
  *         (`green`, `bronze`, `hybrid`), with every other value mapped to `gold`
  */
def get_unpaywall_color(input: String): Option[OpenAccessRoute] =
  input match {
    case null => None
    // Unpaywall spells the non-OA status "closed". The previous check only matched
    // "close", so closed records fell through to the gold default. "close" is still
    // accepted so that existing callers relying on it keep working.
    case status if status.equalsIgnoreCase("closed") || status.equalsIgnoreCase("close") => None
    case status if status.equalsIgnoreCase("green") => Some(OpenAccessRoute.green)
    case status if status.equalsIgnoreCase("bronze") => Some(OpenAccessRoute.bronze)
    case status if status.equalsIgnoreCase("hybrid") => Some(OpenAccessRoute.hybrid)
    // Any other status is treated as gold, as before.
    case _ => Some(OpenAccessRoute.gold)
  }
|
||||||
|
|
||||||
|
/** Maps an open-access status string to an [[OpenAccessRoute]], ignoring case.
  *
  * @param input the status string; may be null
  * @return `None` for null or "closed"; `green`, `bronze` or `hybrid` for the
  *         matching status; `gold` for any other value
  */
def get_color(input: String): Option[OpenAccessRoute] =
  Option(input).flatMap { status =>
    if (status.equalsIgnoreCase("closed")) None
    else
      Some(
        if (status.equalsIgnoreCase("green")) OpenAccessRoute.green
        else if (status.equalsIgnoreCase("bronze")) OpenAccessRoute.bronze
        else if (status.equalsIgnoreCase("hybrid")) OpenAccessRoute.hybrid
        else OpenAccessRoute.gold // fallback for every remaining status
      )
  }
|
||||||
|
|
||||||
def mappingResult(result: Result, json: JValue, instanceType: Qualifier, originalType: String): Result = {
|
def mappingResult(result: Result, json: JValue, instanceType: Qualifier, originalType: String): Result = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
|
@ -575,9 +616,15 @@ case object Crossref2Oaf {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert(input: String, vocabularies: VocabularyGroup): List[CrossrefResult] = {
|
def extract_doi(input: String): CrossrefDT = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
CrossrefDT(doi = (json \ "DOI").extract[String].toLowerCase, json = input, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
def convert(input: CrossrefDT, uw: UnpayWall, vocabularies: VocabularyGroup): List[CrossrefResult] = {
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: json4s.JValue = parse(input.json)
|
||||||
|
|
||||||
var resultList: List[CrossrefResult] = List()
|
var resultList: List[CrossrefResult] = List()
|
||||||
|
|
||||||
|
@ -627,6 +674,32 @@ case object Crossref2Oaf {
|
||||||
CrossrefResult(s.getClass.getSimpleName, mapper.writeValueAsString(s))
|
CrossrefResult(s.getClass.getSimpleName, mapper.writeValueAsString(s))
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (uw != null) {
|
||||||
|
result.getCollectedfrom.add(createUnpayWallCollectedFrom())
|
||||||
|
val i: Instance = new Instance()
|
||||||
|
i.setCollectedfrom(createUnpayWallCollectedFrom())
|
||||||
|
if (uw.best_oa_location != null) {
|
||||||
|
|
||||||
|
i.setUrl(List(uw.best_oa_location.url).asJava)
|
||||||
|
if (uw.best_oa_location.license.isDefined) {
|
||||||
|
i.setLicense(field[String](uw.best_oa_location.license.get, null))
|
||||||
|
}
|
||||||
|
|
||||||
|
val colour = get_unpaywall_color(uw.oa_status)
|
||||||
|
if (colour.isDefined) {
|
||||||
|
val a = new AccessRight
|
||||||
|
a.setClassid(ModelConstants.ACCESS_RIGHT_OPEN)
|
||||||
|
a.setClassname(ModelConstants.ACCESS_RIGHT_OPEN)
|
||||||
|
a.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
|
||||||
|
a.setSchemename(ModelConstants.DNET_ACCESS_MODES)
|
||||||
|
a.setOpenAccessRoute(colour.get)
|
||||||
|
i.setAccessright(a)
|
||||||
|
}
|
||||||
|
i.setPid(result.getPid)
|
||||||
|
result.getInstance().add(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!filterResult(result))
|
if (!filterResult(result))
|
||||||
List()
|
List()
|
||||||
else
|
else
|
||||||
|
|
|
@ -5,6 +5,8 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Dataset => OafDataset}
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
|
import org.apache.spark.sql.functions.{col, lower}
|
||||||
|
import org.apache.spark.sql.types._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger)
|
class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger)
|
||||||
|
@ -18,12 +20,45 @@ class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger
|
||||||
log.info("sourcePath: {}", sourcePath)
|
log.info("sourcePath: {}", sourcePath)
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info("targetPath: {}", targetPath)
|
log.info("targetPath: {}", targetPath)
|
||||||
|
val unpaywallPath = parser.get("unpaywallPath")
|
||||||
|
log.info("unpaywallPath: {}", unpaywallPath)
|
||||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||||
log.info("isLookupUrl: {}", isLookupUrl)
|
log.info("isLookupUrl: {}", isLookupUrl)
|
||||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||||
require(vocabularies != null)
|
require(vocabularies != null)
|
||||||
transformCrossref(spark, sourcePath, targetPath, vocabularies)
|
transformCrossref(spark, sourcePath, targetPath, unpaywallPath, vocabularies)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def transformUnpayWall(spark: SparkSession, unpaywallPath: String, crossrefPath: String): Dataset[UnpayWall] = {
|
||||||
|
val schema = new StructType()
|
||||||
|
.add(StructField("doi", StringType))
|
||||||
|
.add(StructField("is_oa", BooleanType))
|
||||||
|
.add(
|
||||||
|
StructField(
|
||||||
|
"best_oa_location",
|
||||||
|
new StructType()
|
||||||
|
.add("host_type", StringType)
|
||||||
|
.add("license", StringType)
|
||||||
|
.add("url", StringType)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.add("oa_status", StringType)
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
val cId = spark.read
|
||||||
|
.schema(new StructType().add("DOI", StringType))
|
||||||
|
.json(crossrefPath)
|
||||||
|
.withColumn("doi", lower(col("DOI")))
|
||||||
|
|
||||||
|
val uw = spark.read
|
||||||
|
.schema(schema)
|
||||||
|
.json(unpaywallPath)
|
||||||
|
.withColumn("doi", lower(col("doi")))
|
||||||
|
.where("is_oa = true and best_oa_location.url is not null")
|
||||||
|
|
||||||
|
uw.join(cId, uw("doi") === cId("doi"), "leftsemi").as[UnpayWall].cache()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,12 +66,19 @@ class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger
|
||||||
spark: SparkSession,
|
spark: SparkSession,
|
||||||
sourcePath: String,
|
sourcePath: String,
|
||||||
targetPath: String,
|
targetPath: String,
|
||||||
|
unpaywallPath: String,
|
||||||
vocabularies: VocabularyGroup
|
vocabularies: VocabularyGroup
|
||||||
): Unit = {
|
): Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val dump = spark.read.text(sourcePath).as[String]
|
val dump: Dataset[String] = spark.read.text(sourcePath).as[String]
|
||||||
dump
|
|
||||||
.flatMap(s => Crossref2Oaf.convert(s, vocabularies))
|
val uw = transformUnpayWall(spark, unpaywallPath, sourcePath)
|
||||||
|
|
||||||
|
val crId = dump.map(s => Crossref2Oaf.extract_doi(s))
|
||||||
|
|
||||||
|
crId
|
||||||
|
.joinWith(uw, crId("doi") === uw("doi"), "left")
|
||||||
|
.flatMap(s => Crossref2Oaf.convert(s._1, s._2, vocabularies))
|
||||||
.write
|
.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.partitionBy("oafType")
|
.partitionBy("oafType")
|
||||||
|
|
|
@ -21,7 +21,6 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
|
||||||
super.setUpVocabulary()
|
super.setUpVocabulary()
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
def testMapping(): Unit = {
|
def testMapping(): Unit = {
|
||||||
val spark = SparkSession.builder().master("local[*]").appName("TransformCrossref").getOrCreate()
|
val spark = SparkSession.builder().master("local[*]").appName("TransformCrossref").getOrCreate()
|
||||||
|
|
||||||
|
@ -32,6 +31,7 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
|
||||||
spark,
|
spark,
|
||||||
sourcePath = "/home/sandro/Downloads/crossref",
|
sourcePath = "/home/sandro/Downloads/crossref",
|
||||||
targetPath = "/home/sandro/Downloads/crossref_transformed",
|
targetPath = "/home/sandro/Downloads/crossref_transformed",
|
||||||
|
unpaywallPath = null,
|
||||||
vocabularies = vocabularies
|
vocabularies = vocabularies
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -71,6 +71,8 @@ object UnpayWallToOAF {
|
||||||
}
|
}
|
||||||
|
|
||||||
def convertToOAF(input: String): Publication = {
|
def convertToOAF(input: String): Publication = {
|
||||||
|
|
||||||
|
|
||||||
val pub = new Publication
|
val pub = new Publication
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
Loading…
Reference in New Issue