forked from antonis.lempesis/dnet-hadoop
implemented workflow of creation action set for scholexplorer
This commit is contained in:
parent
df8715a1ec
commit
3d8f0f629b
|
@ -64,26 +64,24 @@ abstract class AbstractRestClient extends Iterator[String]{
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000).build()
|
||||||
val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
var tries = 4
|
var tries = 4
|
||||||
try {
|
while (tries > 0) {
|
||||||
while (tries > 0) {
|
|
||||||
println(s"requesting ${r.getURI}")
|
println(s"requesting ${r.getURI}")
|
||||||
val response = client.execute(r)
|
try {
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
val response = client.execute(r)
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
tries -= 1
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
|
tries -= 1
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
|
} catch {
|
||||||
|
case e: Throwable =>
|
||||||
|
println(s"Error on requesting ${r.getURI}")
|
||||||
|
e.printStackTrace()
|
||||||
|
tries-=1
|
||||||
}
|
}
|
||||||
else
|
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
|
||||||
}
|
}
|
||||||
""
|
""
|
||||||
} catch {
|
}
|
||||||
case e: Throwable =>
|
|
||||||
throw new RuntimeException("Error on executing request ", e)
|
|
||||||
} finally try client.close()
|
|
||||||
catch {
|
|
||||||
case e: IOException =>
|
|
||||||
throw new RuntimeException("Unable to close client ", e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
getBufferData()
|
getBufferData()
|
||||||
}
|
}
|
|
@ -9,6 +9,41 @@
|
||||||
|
|
||||||
<artifactId>dhp-graph-provision</artifactId>
|
<artifactId>dhp-graph-provision</artifactId>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>net.alchim31.maven</groupId>
|
||||||
|
<artifactId>scala-maven-plugin</artifactId>
|
||||||
|
<version>4.0.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>scala-compile-first</id>
|
||||||
|
<phase>initialize</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
<goal>compile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>scala-test-compile</id>
|
||||||
|
<phase>process-test-resources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>testCompile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
<configuration>
|
||||||
|
<args>
|
||||||
|
<arg>-Xmax-classfile-name</arg>
|
||||||
|
<arg>200</arg>
|
||||||
|
</args>
|
||||||
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
|
||||||
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -43,7 +43,7 @@ object SparkCreateActionset {
|
||||||
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
||||||
|
|
||||||
relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||||
.flatMap(r => List(r.getSource,r.getTarget)).distinct().write.save(s"$workingDirFolder/id_relation")
|
.flatMap(r => List(r.getSource,r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
|
||||||
|
|
||||||
|
|
||||||
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
||||||
|
@ -56,35 +56,18 @@ object SparkCreateActionset {
|
||||||
relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
||||||
|
|
||||||
log.info("saving publication")
|
log.info("saving entities")
|
||||||
|
|
||||||
val publication:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/publication").as[Result].map(p => (p.getId, p))
|
val entities:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
|
||||||
|
|
||||||
publication
|
|
||||||
.joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
|
entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
||||||
|
entities
|
||||||
|
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
|
||||||
.map(p => p._1._2)
|
.map(p => p._1._2)
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
||||||
|
|
||||||
log.info("saving dataset")
|
|
||||||
val dataset:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/dataset").as[Result].map(p => (p.getId, p))
|
|
||||||
dataset
|
|
||||||
.joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
|
|
||||||
.map(p => p._1._2)
|
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
|
||||||
|
|
||||||
log.info("saving software")
|
|
||||||
val software:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/software").as[Result].map(p => (p.getId, p))
|
|
||||||
software
|
|
||||||
.joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
|
|
||||||
.map(p => p._1._2)
|
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
|
||||||
|
|
||||||
log.info("saving Other Research product")
|
|
||||||
val orp:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/otherresearchproduct").as[Result].map(p => (p.getId, p))
|
|
||||||
orp
|
|
||||||
.joinWith(idRelation, publication("_1").equalTo(idRelation("value")))
|
|
||||||
.map(p => p._1._2)
|
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ExportDataset"/>
|
<start to="createActionSet"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
@ -26,7 +26,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Create Action Set</name>
|
<name>Create Action Set</name>
|
||||||
<class>eu.dnetlib.dhp.sx.provision.SparkCreateActionset</class>
|
<class>eu.dnetlib.dhp.sx.provision.SparkCreateActionset</class>
|
||||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
@ -42,7 +42,7 @@
|
||||||
<arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
|
<arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="SaveActionSet"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Save Action Set</name>
|
<name>Save Action Set</name>
|
||||||
<class>eu.dnetlib.dhp.sx.provision.SparkSaveActionSet</class>
|
<class>eu.dnetlib.dhp.sx.provision.SparkSaveActionSet</class>
|
||||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
|
Loading…
Reference in New Issue