implemented the action set creation workflow for Scholexplorer

Sandro La Bruzzo 2021-07-28 16:15:15 +02:00
parent df8715a1ec
commit 3d8f0f629b
4 changed files with 61 additions and 45 deletions

View File

@@ -64,26 +64,24 @@ abstract class AbstractRestClient extends Iterator[String]{
       .setSocketTimeout(timeout * 1000).build()
     val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
     var tries = 4
-    try {
-      while (tries > 0) {
-        println(s"requesting ${r.getURI}")
-        val response = client.execute(r)
-        println(s"get response with status${response.getStatusLine.getStatusCode}")
-        if (response.getStatusLine.getStatusCode > 400) {
-          tries -= 1
-        }
-        else
-          return IOUtils.toString(response.getEntity.getContent)
-      }
-      ""
-    } catch {
-      case e: Throwable =>
-        throw new RuntimeException("Error on executing request ", e)
-    } finally try client.close()
-    catch {
-      case e: IOException =>
-        throw new RuntimeException("Unable to close client ", e)
-    }
-  }
+    while (tries > 0) {
+      println(s"requesting ${r.getURI}")
+      try {
+        val response = client.execute(r)
+        println(s"get response with status${response.getStatusLine.getStatusCode}")
+        if (response.getStatusLine.getStatusCode > 400) {
+          tries -= 1
+        }
+        else
+          return IOUtils.toString(response.getEntity.getContent)
+      } catch {
+        case e: Throwable =>
+          println(s"Error on requesting ${r.getURI}")
+          e.printStackTrace()
+          tries-=1
+      }
+    }
+    ""
+  }
   getBufferData()
 }
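Note: the change above moves the try/catch inside the retry loop, so a thrown exception (or an HTTP status above 400) now consumes one of the four attempts instead of aborting the whole request; the old finally block that closed the client is dropped. A minimal, standalone sketch of the resulting pattern, with illustrative names that are not from the repository:

def fetchWithRetries(request: () => String, maxTries: Int = 4): String = {
  var tries = maxTries
  while (tries > 0) {
    try {
      return request() // success: return the body immediately
    } catch {
      case e: Throwable =>
        e.printStackTrace() // transient failure: log it and spend one attempt
        tries -= 1
    }
  }
  "" // all attempts exhausted: fall back to an empty result
}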

View File

@@ -9,6 +9,41 @@
     <artifactId>dhp-graph-provision</artifactId>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.0.1</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <args>
+                        <arg>-Xmax-classfile-name</arg>
+                        <arg>200</arg>
+                    </args>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
     <dependencies>
         <dependency>
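Note: the build section added above wires Scala compilation into this module with scala-maven-plugin 4.0.1 and caps generated class file names at 200 characters via -Xmax-classfile-name. The cap matters because scalac emits one class file per closure, and the synthetic names concatenate every enclosing scope, which can exceed filesystem name limits. A hedged illustration (all names invented for the example, not from the repository):

object ScholexplorerProvisionExample {
  def buildActionSet(): Unit = {
    val outer = () => {          // each nested lambda compiles to its own synthetic class,
      val inner = () => {        // with a name built from all enclosing scopes, e.g.
        println("ScholexplorerProvisionExample$$anonfun$buildActionSet$1$$anonfun$apply$1.class")
      }
      inner()
    }
    outer()
  }
}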

View File

@@ -43,7 +43,7 @@ object SparkCreateActionset {
 
     val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
 
     relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
-      .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.save(s"$workingDirFolder/id_relation")
+      .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
 
     val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
@@ -56,35 +56,18 @@ object SparkCreateActionset {
     relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
       .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
 
-    log.info("saving publication")
-    val publication:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/publication").as[Result].map(p => (p.getId, p))
-    publication
-      .joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
+    log.info("saving entities")
+    val entities:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
+
+    entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
+    entities
+      .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
       .map(p => p._1._2)
       .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
-
-    log.info("saving dataset")
-    val dataset:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/dataset").as[Result].map(p => (p.getId, p))
-    dataset
-      .joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
-      .map(p => p._1._2)
-      .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
-
-    log.info("saving software")
-    val software:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/software").as[Result].map(p => (p.getId, p))
-    software
-      .joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
-      .map(p => p._1._2)
-      .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
-
-    log.info("saving Other Research product")
-    val orp:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/otherresearchproduct").as[Result].map(p => (p.getId, p))
-    orp
-      .joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
-      .map(p => p._1._2)
-      .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
   }
 }
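Note: the rewrite above collapses four near-identical blocks (publication, dataset, software, otherresearchproduct) into a single read of $sourcePath/entities/*, which also removes the copy-paste bug where the dataset, software, and orp joins all keyed on publication("_1"). A self-contained sketch of the join under assumed types (Entity is a hypothetical stand-in for the repository's Result model):

import org.apache.spark.sql.{Dataset, SparkSession}

case class Entity(id: String, title: String) // hypothetical stand-in for Result

def filterEntitiesByRelationIds(spark: SparkSession,
                                entities: Dataset[(String, Entity)],
                                idRelation: Dataset[String]): Dataset[Entity] = {
  import spark.implicits._
  entities
    .joinWith(idRelation, entities("_1").equalTo(idRelation("value"))) // keep only ids seen in relations
    .map(p => p._1._2)                                                 // project the entity back out
}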

View File

@@ -14,7 +14,7 @@
         </property>
     </parameters>
 
-    <start to="ExportDataset"/>
+    <start to="createActionSet"/>
 
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -26,7 +26,7 @@
             <mode>cluster</mode>
             <name>Create Action Set</name>
             <class>eu.dnetlib.dhp.sx.provision.SparkCreateActionset</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
@@ -42,7 +42,7 @@
             <arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
             <arg>--master</arg><arg>yarn-cluster</arg>
         </spark>
-        <ok to="End"/>
+        <ok to="SaveActionSet"/>
         <error to="Kill"/>
     </action>
@@ -53,7 +53,7 @@
             <mode>cluster</mode>
             <name>Save Action Set</name>
             <class>eu.dnetlib.dhp.sx.provision.SparkSaveActionSet</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}