forked from D-Net/dnet-hadoop
updated workflow baseline to direct transform on OAF
parent c1cef5d685
commit 0d0904f4ec
@@ -1,7 +1,7 @@
 [
   {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
   {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
   {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
-  {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the mdstore path to save", "paramRequired": true}
+  {"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the mdstore path to save", "paramRequired": true}
 ]
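
The only functional change in the parameter definition is the replacement of targetPath with mdstoreOutputVersion, which now carries the serialized mdstore version rather than a plain output path. A minimal sketch of how such a definition is consumed, assuming dnet-hadoop's ArgumentApplicationParser is built from the JSON above; every concrete argument value below is hypothetical:

import eu.dnetlib.dhp.application.ArgumentApplicationParser

// Build the parser straight from the parameter JSON shown above.
val parser = new ArgumentApplicationParser(
  """[
    |  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
    |  {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
    |  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
    |  {"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the mdstore path to save", "paramRequired": true}
    |]""".stripMargin
)

// Hypothetical invocation mirroring the Oozie <arg> elements in the workflow below.
parser.parseArgument(
  Array(
    "--master", "yarn",
    "--isLookupUrl", "http://example.org/is/services/isLookUp?wsdl",        // hypothetical
    "--sourcePath", "/data/pubmed/baseline",                                 // hypothetical
    "--mdstoreOutputVersion", "{\"hdfsPath\":\"/data/mdstores/md-1/v-1\"}"   // hypothetical
  )
)

val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
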
@@ -16,11 +16,6 @@
             <name>mdStoreManagerURI</name>
             <description>the path of the cleaned mdstore</description>
         </property>
-        <property>
-            <name>skipUpdate</name>
-            <value>false</value>
-            <description>The request block size</description>
-        </property>
     </parameters>

     <start to="StartTransaction"/>
@@ -44,16 +39,16 @@
             <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
             <capture-output/>
         </java>
-        <ok to="ConvertDataset"/>
+        <ok to="TransformPubMed"/>
        <error to="RollBack"/>
     </action>

-    <action name="ConvertDataset">
+    <action name="TransformPubMed">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Convert Baseline to OAF Dataset</name>
-            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
+            <name>Convert Baseline Pubmed to OAF Dataset</name>
+            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump</class>
             <jar>dhp-aggregation-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
@@ -65,12 +60,10 @@
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
-            <arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
+            <arg>--sourcePath</arg><arg>${baselineWorkingPath}</arg>
             <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
             <arg>--master</arg><arg>yarn</arg>
             <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
-            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
-            <arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
         </spark>
         <ok to="CommitVersion"/>
         <error to="RollBack"/>
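
The StartTransaction java action uses <capture-output/>, and the new TransformPubMed spark action reads its result back with ${wf:actionData('StartTransaction')['mdStoreVersion']}. Under Oozie's capture-output contract, the java action exposes values by writing them to the properties file named by the oozie.action.output.properties system property. A minimal sketch of that hand-off, assuming the action publishes the serialized mdstore version under the key mdStoreVersion (anything beyond what the workflow itself shows is an assumption):

import java.io.{File, FileOutputStream}
import java.util.Properties

// Write a key/value pair that later workflow nodes can read back with
// ${wf:actionData('StartTransaction')['mdStoreVersion']}.
def emitActionData(mdStoreVersionJson: String): Unit = {
  val props = new Properties()
  props.setProperty("mdStoreVersion", mdStoreVersionJson)
  val out = new FileOutputStream(new File(System.getProperty("oozie.action.output.properties")))
  try props.store(out, "") // the Oozie launcher picks this file up when the action ends
  finally out.close()
}
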
@@ -1,18 +1,14 @@
 package eu.dnetlib.dhp.sx.bio.ebi

 import com.fasterxml.jackson.databind.ObjectMapper
-import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
+import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.oaf.Oaf
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}

-import java.io.ByteArrayInputStream
-import javax.xml.stream.XMLInputFactory
-
 class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
   extends AbstractScalaApplication(propertyPath, args, log: Logger) {
@@ -24,16 +20,26 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
     log.info("isLookupUrl: {}", isLookupUrl)
     val sourcePath = parser.get("sourcePath")
     log.info(s"SourcePath is '$sourcePath'")
-    val targetPath = parser.get("targetPath")
-    log.info(s"TargetPath is '$targetPath'")
+    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
+    val mapper = new ObjectMapper()
+    val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+    log.info(s"outputBasePath is '$outputBasePath'")

     val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
     val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

-    createPubmedDump(spark, sourcePath, targetPath, vocabularies)
+    createPubmedDump(spark, sourcePath, outputBasePath, vocabularies)

   }

+  /** This method creates a dump of the pubmed articles
+    * @param spark the spark session
+    * @param sourcePath the path of the source file
+    * @param targetPath the path of the target file
+    * @param vocabularies the vocabularies
+    */
   def createPubmedDump(
     spark: SparkSession,
     sourcePath: String,
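
Instead of a plain targetPath, run() now deserializes the mdstoreOutputVersion argument into an MDStoreVersion and takes its HDFS path as the output base. A short sketch of that step with a hypothetical payload, assuming the bean deserializes field-for-field (only hdfsPath, the field actually read here, is shown):

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion

// hypothetical payload; the real one is produced by the StartTransaction action
val mdstoreOutputVersion = """{"hdfsPath":"/data/mdstores/md-00001/v-00001"}"""

val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath // "/data/mdstores/md-00001/v-00001"
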
@@ -54,6 +60,7 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
       })
       .filter(s => s != null)
       .map { i =>
+        //remove try catch
         try {
           new PMParser2().parse(i)
         } catch {
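
The new inline comment marks the try/catch around PMParser2 as temporary. For reference, a hedged sketch of an equivalent, more idiomatic way to drop unparsable records with scala.util.Try, assuming PMParser2.parse takes a single-article XML string and returns a PMArticle:

import scala.util.Try
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2}

// `articles` stands for the collection of single-article XML fragments produced
// by the preceding steps; records that fail to parse are silently dropped.
def parseAll(articles: Seq[String]): Seq[PMArticle] =
  articles.flatMap(xml => Try(new PMParser2().parse(xml)).toOption)
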