enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
16 changed files with 100 additions and 98 deletions
Showing only changes of commit b87b3ddb6b - Show all commits

View File

@ -2,8 +2,10 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
public class Result extends OafEntity implements Serializable { public class Result extends OafEntity implements Serializable {
@ -248,13 +250,24 @@ public class Result extends OafEntity implements Serializable {
StructuredProperty baseMainTitle = null; StructuredProperty baseMainTitle = null;
if (title != null) { if (title != null) {
baseMainTitle = getMainTitle(title); baseMainTitle = getMainTitle(title);
title.remove(baseMainTitle); if (baseMainTitle != null) {
final StructuredProperty p = baseMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
//
//
// title.remove(baseMainTitle);
} }
StructuredProperty newMainTitle = null; StructuredProperty newMainTitle = null;
if (r.getTitle() != null) { if (r.getTitle() != null) {
newMainTitle = getMainTitle(r.getTitle()); newMainTitle = getMainTitle(r.getTitle());
r.getTitle().remove(newMainTitle); if (newMainTitle != null) {
final StructuredProperty p = newMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
// r.getTitle().remove(newMainTitle);
} }
if (newMainTitle != null && compareTrust(this, r) < 0) if (newMainTitle != null && compareTrust(this, r) < 0)

View File

@ -86,7 +86,7 @@ object DoiBoostMappingUtil {
} }
def generateGridAffiliationId(gridId:String) :String = { def generateGridAffiliationId(gridId:String) :String = {
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}" s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
} }

View File

@ -34,12 +34,12 @@ object SparkGenerateDoiBoost {
val workingDirPath = parser.get("workingDirPath") val workingDirPath = parser.get("workingDirPath")
logger.info("Phase 1) repartition and move all the dataset in a same working folder") // logger.info("Phase 1) repartition and move all the dataset in a same working folder")
spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication") // spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset") // spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication") // spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication") // spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication") // spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]

View File

@ -71,6 +71,8 @@ case object ConversionUtil {
} else { } else {
if (a == null) b else a if (a == null) b else a
} }
} }
def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = { def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {

View File

@ -43,6 +43,9 @@ object SparkPreProcessMAG {
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result) val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct") distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
logger.info("Phase 6) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId") logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor] val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -108,9 +111,9 @@ object SparkPreProcessMAG {
.save(s"${parser.get("targetPath")}/merge_step_3") .save(s"${parser.get("targetPath")}/merge_step_3")
logger.info("Phase 6) Enrich Publication with description") // logger.info("Phase 6) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] // val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") // pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract] val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]

View File

@ -17,6 +17,10 @@ import scala.collection.JavaConverters._
case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){} case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){}
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
object ORCIDToOAF { object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper val mapper = new ObjectMapper
@ -45,45 +49,24 @@ object ORCIDToOAF {
} }
def convertTOOAF(input:String) :Publication = { def convertTOOAF(input:ORCIDElement) :Publication = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats val doi = input.doi
val item:(String, String) = extractValueFromInputString(input)
if (item== null) {
return null
}
val json_str = item._2
lazy val json: json4s.JValue = parse(json_str)
val doi = item._1
val pub:Publication = new Publication val pub:Publication = new Publication
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava) pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo()) pub.setDataInfo(generateDataInfo())
pub.setId(generateIdentifier(pub, doi.toLowerCase)) pub.setId(generateIdentifier(pub, doi.toLowerCase))
try{ try{
val authorList:List[ORCIDItem] = json.extract[List[ORCIDItem]] pub.setAuthor(input.authors.map(a=> {
pub.setAuthor(authorList.map(a=> {
generateAuhtor(a.name, a.surname, a.creditName, a.oid) generateAuhtor(a.name, a.surname, a.creditName, a.oid)
}).asJava) }).asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava) pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub pub
} catch { } catch {
case e: Throwable => case e: Throwable =>
logger.info(s"ERROR ON GENERATE Publication from $input") logger.info(s"ERROR ON GENERATE Publication from $input")
null null
} }
} }
def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = { def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = {

View File

@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
@ -25,17 +26,19 @@ object SparkConvertORCIDToOAF {
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
import spark.implicits._
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath") val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
println(s"SourcePath is $sourcePath, targetPath is $targetPath master is ${parser.get("master")} ")
logger.info("Converting ORCID to OAF") logger.info("Converting ORCID to OAF")
val d:Dataset[Publication] = spark.createDataset(inputRDD.map(ORCIDToOAF.convertTOOAF).filter(p=>p!=null)).as[Publication] val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
d.write.mode(SaveMode.Overwrite).save(targetPath) .map(_._2)
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
} }
} }

File diff suppressed because one or more lines are too long

View File

@ -52,7 +52,7 @@
<start to="GenerateActionSet"/> <start to="CreateDOIBoost"/>
<kill name="Kill"> <kill name="Kill">
@ -92,7 +92,7 @@
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg> <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>
<ok to="GenerateActionSet"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>

View File

@ -15,6 +15,10 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property> <property>
<name>hive_metastore_uris</name> <name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>

View File

@ -31,8 +31,8 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${targetPath}/preprocess'/> <delete path='${targetPath}'/>
<mkdir path='${targetPath}/preprocess'/> <mkdir path='${targetPath}'/>
</fs> </fs>
<ok to="PreprocessMag"/> <ok to="PreprocessMag"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -3,11 +3,16 @@ package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import eu.dnetlib.doiboost.mag.ConversionUtil
import eu.dnetlib.doiboost.orcid.ORCIDElement
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import scala.io.Source
class DoiBoostHostedByMapTest { class DoiBoostHostedByMapTest {
@Test @Test
@ -19,7 +24,7 @@ class DoiBoostHostedByMapTest {
@Test @Test
def testFilter():Unit = { def testMerge():Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
@ -29,19 +34,23 @@ class DoiBoostHostedByMapTest {
.master("local[*]").getOrCreate() .master("local[*]").getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
import spark.implicits._
val dataset:Dataset[ORCIDElement] = spark.read.json("/home/sandro/orcid").as[ORCIDElement]
dataset.show(false)
val pub =spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]
val mapper = new ObjectMapper()
val map = DoiBoostMappingUtil.retrieveHostedByMap()
println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
} }

View File

@ -24,28 +24,6 @@ class MappingORCIDToOAFTest {
} }
@Test
def testConvertOrcidToOAF():Unit ={
val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
assertFalse(json.isEmpty)
// json.lines.foreach(s => {
//
// })
val p :Publication = ORCIDToOAF.convertTOOAF(json.lines.next())
logger.info(mapper.writeValueAsString(p))
}

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,11 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper; import static org.junit.jupiter.api.Assertions.*;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import java.io.IOException;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import java.io.StringReader;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -11,10 +13,11 @@ import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException; import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.StringReader;
import static org.junit.jupiter.api.Assertions.*; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
public class XmlRecordFactoryTest { public class XmlRecordFactoryTest {
@ -31,7 +34,8 @@ public class XmlRecordFactoryTest {
ContextMapper contextMapper = new ContextMapper(); ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, otherDsTypeId); XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
otherDsTypeId);
String xml = xmlRecordFactory.build(je); String xml = xmlRecordFactory.build(je);