minor changes; fixed wrong numbers in test because of a change in the input resource

Miriam Baglioni 2022-12-31 13:00:00 +01:00
parent 2cae97d049
commit 4dcd03b78e
7 changed files with 46 additions and 36 deletions

View File

@@ -1,6 +1,5 @@
import java.io.IOException;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
@@ -10,6 +9,7 @@ import com.github.imifou.jsonschema.module.addon.AddonModule;
import com.github.victools.jsonschema.generator.*;
import eu.dnetlib.dhp.ExecCreateSchemas;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.graph.Datasource;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.oa.model.graph.Organization;
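
Note: the two hunks above only move the CommunityResult import into its sorted position in the schema-generation test. For context, a minimal sketch of how a schema is generated with the victools generator and the imifou add-on module imported here; the SchemaSketch class is hypothetical and not the project's ExecCreateSchemas:

import com.fasterxml.jackson.databind.JsonNode;
import com.github.imifou.jsonschema.module.addon.AddonModule;
import com.github.victools.jsonschema.generator.OptionPreset;
import com.github.victools.jsonschema.generator.SchemaGenerator;
import com.github.victools.jsonschema.generator.SchemaGeneratorConfigBuilder;
import com.github.victools.jsonschema.generator.SchemaVersion;

import eu.dnetlib.dhp.oa.model.community.CommunityResult;

public class SchemaSketch {
	public static void main(String[] args) {
		// the AddonModule resolves the add-on @JsonSchema annotations on the dump model classes
		SchemaGeneratorConfigBuilder configBuilder = new SchemaGeneratorConfigBuilder(
			SchemaVersion.DRAFT_7, OptionPreset.PLAIN_JSON).with(new AddonModule());
		SchemaGenerator generator = new SchemaGenerator(configBuilder.build());
		JsonNode schema = generator.generateSchema(CommunityResult.class);
		System.out.println(schema.toPrettyString());
	}
}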

View File

@@ -8,6 +8,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@@ -71,7 +72,8 @@ public class SparkUpdateProjectInfo implements Serializable {
String preparedInfoPath) {
Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
Dataset<CommunityResult> tmp = result
result
.joinWith(
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
"left")
@@ -79,9 +81,7 @@ public class SparkUpdateProjectInfo implements Serializable {
CommunityResult r = value._1();
Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
return r;
}, Encoders.bean(CommunityResult.class));
long count = tmp.count();
tmp
}, Encoders.bean(CommunityResult.class))
.map(
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
Encoders.STRING())
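
The change above drops the intermediate tmp dataset and its count() action, which forced an extra Spark job for no functional gain, and chains the whole pipeline instead. A sketch of the resulting shape; CommunityResult, ResultProject and Utils.readPath are the module's own classes (imports omitted), while the wrapper class and the gzipped-text writer at the end are assumptions:

import java.io.Serializable;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import scala.Tuple2;

public class ProjectEnrichmentSketch implements Serializable {

	static void enrich(SparkSession spark, String inputPath, String preparedInfoPath, String outputPath) {
		Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
		Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
		result
			.joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")), "left")
			.map((MapFunction<Tuple2<CommunityResult, ResultProject>, CommunityResult>) value -> {
				CommunityResult r = value._1();
				// with a left join the right side is null when no project matched
				Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
				return r;
			}, Encoders.bean(CommunityResult.class))
			.map(
				(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
				Encoders.STRING())
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.text(outputPath);
	}
}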

View File

@@ -88,7 +88,10 @@ public class SparkDumpFunderResults implements Serializable {
} else {
String fName = p.getId().substring(3, p.getId().indexOf("_")).toUpperCase();
if (fName.equalsIgnoreCase("ec")) {
if (p.getId().contains("h2020")) {
if (p.getId().contains("he")) {
fName += "_HE";
} else if (p.getId().contains("h2020")) {
fName += "_H2020";
} else {
fName += "_FP7";

View File

@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.dump.funderresults;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
@@ -17,6 +18,8 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
@@ -99,13 +102,19 @@ public class SparkResultLinkedToProject implements Serializable {
.map(
t2._1(),
communityMap, Constants.DUMPTYPE.FUNDER.getType());
cr.setProjects(t2._2().getProjectsList());
if (cr != null) {
cr.setProjects(t2._2().getProjectsList());
}
return cr;
}, Encoders.bean(CommunityResult.class))
.filter(Objects::nonNull)
.map(
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
.text(outputPath);
}
}
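
Two fixes land in this file: the mapper can return null for records that cannot be dumped, so projects are only attached when a result was actually produced and the nulls are filtered out before serialisation; and the writer switches from json() to text(). Each row is already a serialised JSON string, so json() would wrap it again as {"value":"..."} while text() writes the line verbatim. A hypothetical standalone demo of that difference:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class JsonVsTextSketch {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate();
		Dataset<String> rows = spark.createDataset(
			Arrays.asList("{\"id\":\"50|xyz\",\"projects\":[]}"), Encoders.STRING());

		// json() re-encodes each string as a field: {"value":"{\"id\":...}"}
		rows.write().mode(SaveMode.Overwrite).json("/tmp/demo-json");

		// text() writes each string verbatim, keeping the dump line-oriented JSON
		rows.write().mode(SaveMode.Overwrite).text("/tmp/demo-text");
	}
}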

View File

@@ -238,7 +238,7 @@
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
<value>${outputPath}/dump</value>
</property>
<property>
<name>sourcePath</name>
@@ -279,7 +279,7 @@
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
<value>${outputPath}/dump</value>
</property>
</configuration>
</sub-workflow>
@@ -299,7 +299,7 @@
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
<value>${outputPath}/dump</value>
</property>
<property>
<name>sourcePath</name>
@@ -315,28 +315,28 @@
<error to="Kill" />
</action>
<action name="make_archive">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
</java>
<ok to="should_upload"/>
<error to="Kill"/>
</action>
<!-- <action name="make_archive">-->
<!-- <java>-->
<!-- <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>-->
<!-- <arg>&#45;&#45;hdfsPath</arg><arg>${outputPath}/tar</arg>-->
<!-- <arg>&#45;&#45;hdfsPath</arg><arg>${outputPath}</arg>-->
<!-- <arg>&#45;&#45;nameNode</arg><arg>${nameNode}</arg>-->
<!-- <arg>&#45;&#45;sourcePath</arg><arg>${outputPath}/dump</arg>-->
<!-- <arg>&#45;&#45;sourcePath</arg><arg>${workingDir}/tar</arg>-->
<!-- </java>-->
<!-- <ok to="should_upload"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="make_archive">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}/tar</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${outputPath}/dump</arg>
</java>
<ok to="should_upload"/>
<error to="Kill"/>
</action>
<decision name="should_upload">
<switch>
<case to="send_zenodo">${wf:conf('upload') eq true}</case>
@@ -347,7 +347,7 @@
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--hdfsPath</arg><arg>${outputPath}/tar/</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>${accessToken}</arg>
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>

View File

@@ -456,7 +456,7 @@ public class DumpSubsetTest {
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/subset/dump/community_infrastructure")
.getPath())
.saveAsTextFile(workingDir.toString() + "/dump/community_infrastructure");
.saveAsTextFile(workingDir.toString() + "/dump/communities_infrastructures");
SparkSelectValidRelationContext
.main(
@@ -512,10 +512,12 @@ public class DumpSubsetTest {
.textFile(workingDir.toString() + "/relation")
.map(item -> OBJECT_MAPPER.readValue(item, eu.dnetlib.dhp.oa.model.graph.Relation.class));
Assertions.assertEquals(94, tmp.count());
Assertions.assertEquals(47, tmp.filter(r -> r.getSource().getId().startsWith("50|")).count());
Assertions.assertEquals(36, tmp.filter(r -> r.getSource().getId().startsWith("10|")).count());
Assertions.assertEquals(11, tmp.filter(r -> r.getSource().getId().startsWith("00|")).count());
Assertions.assertEquals(102, tmp.count());
Assertions.assertEquals(51, tmp.filter(r -> r.getSource().getId().startsWith("50|")).count());
Assertions.assertEquals(39, tmp.filter(r -> r.getSource().getId().startsWith("10|")).count());
Assertions.assertEquals(12, tmp.filter(r -> r.getSource().getId().startsWith("00|")).count());
}
}
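
The new expectations stay internally consistent: 51 + 39 + 12 = 102, just as the old ones summed to 94 (47 + 36 + 11); the prefixes presumably distinguish result (50|), datasource (10|) and context (00|) sources. A sketch of one extra assertion that would pin the partition invariant down, reusing tmp exactly as built in the test above:

// hypothetical extra assertion: the per-prefix counts must partition the total
long total = tmp.count();
long fromResults = tmp.filter(r -> r.getSource().getId().startsWith("50|")).count();
long fromDatasources = tmp.filter(r -> r.getSource().getId().startsWith("10|")).count();
long fromContexts = tmp.filter(r -> r.getSource().getId().startsWith("00|")).count();
Assertions.assertEquals(total, fromResults + fromDatasources + fromContexts);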