enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
16 changed files with 100 additions and 98 deletions
Showing only changes of commit b87b3ddb6b - Show all commits

View File

@ -2,8 +2,10 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
public class Result extends OafEntity implements Serializable {
@ -248,13 +250,24 @@ public class Result extends OafEntity implements Serializable {
StructuredProperty baseMainTitle = null;
if (title != null) {
baseMainTitle = getMainTitle(title);
title.remove(baseMainTitle);
if (baseMainTitle != null) {
final StructuredProperty p = baseMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
//
//
// title.remove(baseMainTitle);
}
StructuredProperty newMainTitle = null;
if (r.getTitle() != null) {
newMainTitle = getMainTitle(r.getTitle());
r.getTitle().remove(newMainTitle);
if (newMainTitle != null) {
final StructuredProperty p = newMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
// r.getTitle().remove(newMainTitle);
}
if (newMainTitle != null && compareTrust(this, r) < 0)

View File

@ -86,7 +86,7 @@ object DoiBoostMappingUtil {
}
def generateGridAffiliationId(gridId:String) :String = {
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
}

View File

@ -34,12 +34,12 @@ object SparkGenerateDoiBoost {
val workingDirPath = parser.get("workingDirPath")
logger.info("Phase 1) repartition and move all the dataset in a same working folder")
spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
// logger.info("Phase 1) repartition and move all the dataset in a same working folder")
// spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
// spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
// spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
// spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
// spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]

View File

@ -71,6 +71,8 @@ case object ConversionUtil {
} else {
if (a == null) b else a
}
}
def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {

View File

@ -43,6 +43,9 @@ object SparkPreProcessMAG {
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
logger.info("Phase 6) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -108,9 +111,9 @@ object SparkPreProcessMAG {
.save(s"${parser.get("targetPath")}/merge_step_3")
logger.info("Phase 6) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]

View File

@ -17,6 +17,10 @@ import scala.collection.JavaConverters._
case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){}
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper
@ -45,45 +49,24 @@ object ORCIDToOAF {
}
def convertTOOAF(input:String) :Publication = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val item:(String, String) = extractValueFromInputString(input)
if (item== null) {
return null
}
val json_str = item._2
lazy val json: json4s.JValue = parse(json_str)
val doi = item._1
def convertTOOAF(input:ORCIDElement) :Publication = {
val doi = input.doi
val pub:Publication = new Publication
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
pub.setId(generateIdentifier(pub, doi.toLowerCase))
try{
val authorList:List[ORCIDItem] = json.extract[List[ORCIDItem]]
pub.setAuthor(authorList.map(a=> {
pub.setAuthor(input.authors.map(a=> {
generateAuhtor(a.name, a.surname, a.creditName, a.oid)
}).asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub
} catch {
case e: Throwable =>
logger.info(s"ERROR ON GENERATE Publication from $input")
null
}
}
def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = {

View File

@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
@ -25,17 +26,19 @@ object SparkConvertORCIDToOAF {
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
import spark.implicits._
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
println(s"SourcePath is $sourcePath, targetPath is $targetPath master is ${parser.get("master")} ")
logger.info("Converting ORCID to OAF")
val d:Dataset[Publication] = spark.createDataset(inputRDD.map(ORCIDToOAF.convertTOOAF).filter(p=>p!=null)).as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath)
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
.map(_._2)
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
}
}

File diff suppressed because one or more lines are too long

View File

@ -52,7 +52,7 @@
<start to="GenerateActionSet"/>
<start to="CreateDOIBoost"/>
<kill name="Kill">
@ -92,7 +92,7 @@
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="GenerateActionSet"/>
<ok to="End"/>
<error to="Kill"/>
</action>

View File

@ -15,6 +15,10 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>

View File

@ -31,8 +31,8 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${targetPath}/preprocess'/>
<mkdir path='${targetPath}/preprocess'/>
<delete path='${targetPath}'/>
<mkdir path='${targetPath}'/>
</fs>
<ok to="PreprocessMag"/>
<error to="Kill"/>

View File

@ -3,11 +3,16 @@ package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import eu.dnetlib.doiboost.mag.ConversionUtil
import eu.dnetlib.doiboost.orcid.ORCIDElement
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import scala.io.Source
class DoiBoostHostedByMapTest {
@Test
@ -19,7 +24,7 @@ class DoiBoostHostedByMapTest {
@Test
def testFilter():Unit = {
def testMerge():Unit = {
val conf: SparkConf = new SparkConf()
val spark: SparkSession =
SparkSession
@ -29,19 +34,23 @@ class DoiBoostHostedByMapTest {
.master("local[*]").getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
import spark.implicits._
val dataset:Dataset[ORCIDElement] = spark.read.json("/home/sandro/orcid").as[ORCIDElement]
dataset.show(false)
val pub =spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]
val mapper = new ObjectMapper()
val map = DoiBoostMappingUtil.retrieveHostedByMap()
println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
}

View File

@ -24,28 +24,6 @@ class MappingORCIDToOAFTest {
}
@Test
def testConvertOrcidToOAF():Unit ={
val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
assertFalse(json.isEmpty)
// json.lines.foreach(s => {
//
// })
val p :Publication = ORCIDToOAF.convertTOOAF(json.lines.next())
logger.info(mapper.writeValueAsString(p))
}

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,11 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.io.StringReader;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@ -11,10 +13,11 @@ import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.StringReader;
import com.fasterxml.jackson.databind.ObjectMapper;
import static org.junit.jupiter.api.Assertions.*;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
public class XmlRecordFactoryTest {
@ -31,7 +34,8 @@ public class XmlRecordFactoryTest {
ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, otherDsTypeId);
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
otherDsTypeId);
String xml = xmlRecordFactory.build(je);