enrichment steps #38
|
@ -2,8 +2,10 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class Result extends OafEntity implements Serializable {
|
||||
|
||||
|
@ -248,13 +250,24 @@ public class Result extends OafEntity implements Serializable {
|
|||
StructuredProperty baseMainTitle = null;
|
||||
if (title != null) {
|
||||
baseMainTitle = getMainTitle(title);
|
||||
title.remove(baseMainTitle);
|
||||
if (baseMainTitle != null) {
|
||||
final StructuredProperty p = baseMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
//
|
||||
//
|
||||
// title.remove(baseMainTitle);
|
||||
}
|
||||
|
||||
StructuredProperty newMainTitle = null;
|
||||
if (r.getTitle() != null) {
|
||||
newMainTitle = getMainTitle(r.getTitle());
|
||||
r.getTitle().remove(newMainTitle);
|
||||
if (newMainTitle != null) {
|
||||
final StructuredProperty p = newMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
// r.getTitle().remove(newMainTitle);
|
||||
}
|
||||
|
||||
if (newMainTitle != null && compareTrust(this, r) < 0)
|
||||
|
|
|
@ -86,7 +86,7 @@ object DoiBoostMappingUtil {
|
|||
}
|
||||
|
||||
def generateGridAffiliationId(gridId:String) :String = {
|
||||
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
||||
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -34,12 +34,12 @@ object SparkGenerateDoiBoost {
|
|||
val workingDirPath = parser.get("workingDirPath")
|
||||
|
||||
|
||||
logger.info("Phase 1) repartition and move all the dataset in a same working folder")
|
||||
spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
||||
spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
||||
spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
||||
spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
||||
spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
||||
// logger.info("Phase 1) repartition and move all the dataset in a same working folder")
|
||||
// spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
||||
// spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
||||
// spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
||||
// spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
||||
// spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
|
|
|
@ -71,6 +71,8 @@ case object ConversionUtil {
|
|||
} else {
|
||||
if (a == null) b else a
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {
|
||||
|
|
|
@ -43,6 +43,9 @@ object SparkPreProcessMAG {
|
|||
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
|
||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
|
||||
|
||||
logger.info("Phase 6) Enrich Publication with description")
|
||||
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
|
||||
logger.info("Phase 3) Group Author by PaperId")
|
||||
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||
|
@ -108,9 +111,9 @@ object SparkPreProcessMAG {
|
|||
.save(s"${parser.get("targetPath")}/merge_step_3")
|
||||
|
||||
|
||||
logger.info("Phase 6) Enrich Publication with description")
|
||||
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
// logger.info("Phase 6) Enrich Publication with description")
|
||||
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
|
||||
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
|
||||
|
||||
|
|
|
@ -17,6 +17,10 @@ import scala.collection.JavaConverters._
|
|||
|
||||
|
||||
case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){}
|
||||
|
||||
|
||||
|
||||
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
||||
object ORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
val mapper = new ObjectMapper
|
||||
|
@ -45,45 +49,24 @@ object ORCIDToOAF {
|
|||
}
|
||||
|
||||
|
||||
def convertTOOAF(input:String) :Publication = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
val item:(String, String) = extractValueFromInputString(input)
|
||||
|
||||
if (item== null) {
|
||||
return null
|
||||
}
|
||||
|
||||
val json_str = item._2
|
||||
lazy val json: json4s.JValue = parse(json_str)
|
||||
|
||||
val doi = item._1
|
||||
|
||||
def convertTOOAF(input:ORCIDElement) :Publication = {
|
||||
val doi = input.doi
|
||||
val pub:Publication = new Publication
|
||||
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
|
||||
pub.setDataInfo(generateDataInfo())
|
||||
pub.setId(generateIdentifier(pub, doi.toLowerCase))
|
||||
|
||||
|
||||
|
||||
|
||||
try{
|
||||
val authorList:List[ORCIDItem] = json.extract[List[ORCIDItem]]
|
||||
|
||||
|
||||
pub.setAuthor(authorList.map(a=> {
|
||||
pub.setAuthor(input.authors.map(a=> {
|
||||
generateAuhtor(a.name, a.surname, a.creditName, a.oid)
|
||||
}).asJava)
|
||||
|
||||
|
||||
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
|
||||
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
pub
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
logger.info(s"ERROR ON GENERATE Publication from $input")
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = {
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.orcid
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
@ -25,17 +26,19 @@ object SparkConvertORCIDToOAF {
|
|||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
import spark.implicits._
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
|
||||
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
|
||||
|
||||
println(s"SourcePath is $sourcePath, targetPath is $targetPath master is ${parser.get("master")} ")
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
val d:Dataset[Publication] = spark.createDataset(inputRDD.map(ORCIDToOAF.convertTOOAF).filter(p=>p!=null)).as[Publication]
|
||||
d.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
|
||||
.map(_._2)
|
||||
|
||||
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -52,7 +52,7 @@
|
|||
|
||||
|
||||
|
||||
<start to="GenerateActionSet"/>
|
||||
<start to="CreateDOIBoost"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
|
@ -92,7 +92,7 @@
|
|||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="GenerateActionSet"/>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
@ -15,6 +15,10 @@
|
|||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.rerun.failnodes</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
|
|
|
@ -31,8 +31,8 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${targetPath}/preprocess'/>
|
||||
<mkdir path='${targetPath}/preprocess'/>
|
||||
<delete path='${targetPath}'/>
|
||||
<mkdir path='${targetPath}'/>
|
||||
</fs>
|
||||
<ok to="PreprocessMag"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -3,11 +3,16 @@ package eu.dnetlib.dhp.doiboost
|
|||
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import eu.dnetlib.doiboost.orcid.ORCIDElement
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
class DoiBoostHostedByMapTest {
|
||||
|
||||
@Test
|
||||
|
@ -19,7 +24,7 @@ class DoiBoostHostedByMapTest {
|
|||
|
||||
|
||||
@Test
|
||||
def testFilter():Unit = {
|
||||
def testMerge():Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
|
@ -29,19 +34,23 @@ class DoiBoostHostedByMapTest {
|
|||
.master("local[*]").getOrCreate()
|
||||
|
||||
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
|
||||
|
||||
import spark.implicits._
|
||||
val dataset:Dataset[ORCIDElement] = spark.read.json("/home/sandro/orcid").as[ORCIDElement]
|
||||
|
||||
|
||||
dataset.show(false)
|
||||
|
||||
|
||||
|
||||
val pub =spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val map = DoiBoostMappingUtil.retrieveHostedByMap()
|
||||
|
||||
println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -24,28 +24,6 @@ class MappingORCIDToOAFTest {
|
|||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testConvertOrcidToOAF():Unit ={
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty)
|
||||
// json.lines.foreach(s => {
|
||||
//
|
||||
// })
|
||||
|
||||
|
||||
val p :Publication = ORCIDToOAF.convertTOOAF(json.lines.next())
|
||||
|
||||
|
||||
logger.info(mapper.writeValueAsString(p))
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,9 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
|
@ -11,37 +13,39 @@ import org.dom4j.io.SAXReader;
|
|||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||
|
||||
public class XmlRecordFactoryTest {
|
||||
|
||||
private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
|
||||
private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
|
||||
|
||||
@Test
|
||||
public void testXMLRecordFactory() throws IOException, DocumentException {
|
||||
@Test
|
||||
public void testXMLRecordFactory() throws IOException, DocumentException {
|
||||
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream("joined_entity.json"));
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream("joined_entity.json"));
|
||||
|
||||
assertNotNull(json);
|
||||
JoinedEntity je = new ObjectMapper().readValue(json, JoinedEntity.class);
|
||||
assertNotNull(je);
|
||||
assertNotNull(json);
|
||||
JoinedEntity je = new ObjectMapper().readValue(json, JoinedEntity.class);
|
||||
assertNotNull(je);
|
||||
|
||||
ContextMapper contextMapper = new ContextMapper();
|
||||
ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, otherDsTypeId);
|
||||
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
|
||||
otherDsTypeId);
|
||||
|
||||
String xml = xmlRecordFactory.build(je);
|
||||
String xml = xmlRecordFactory.build(je);
|
||||
|
||||
assertNotNull(xml);
|
||||
assertNotNull(xml);
|
||||
|
||||
Document doc = new SAXReader().read(new StringReader(xml));
|
||||
Document doc = new SAXReader().read(new StringReader(xml));
|
||||
|
||||
assertNotNull(doc);
|
||||
assertNotNull(doc);
|
||||
|
||||
System.out.println(doc.asXML());
|
||||
System.out.println(doc.asXML());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue