minor changes and fixed wrong number in test because of a change in the input resource

Miriam Baglioni 2022-12-31 13:00:00 +01:00
parent 2cae97d049
commit 4dcd03b78e
7 changed files with 46 additions and 36 deletions

View File

@@ -1,6 +1,5 @@
 import java.io.IOException;
-import eu.dnetlib.dhp.oa.model.community.CommunityResult;
 import org.junit.jupiter.api.Test;
 import com.fasterxml.jackson.core.JsonProcessingException;
@@ -10,6 +9,7 @@ import com.github.imifou.jsonschema.module.addon.AddonModule;
 import com.github.victools.jsonschema.generator.*;
 import eu.dnetlib.dhp.ExecCreateSchemas;
+import eu.dnetlib.dhp.oa.model.community.CommunityResult;
 import eu.dnetlib.dhp.oa.model.graph.Datasource;
 import eu.dnetlib.dhp.oa.model.graph.GraphResult;
 import eu.dnetlib.dhp.oa.model.graph.Organization;

View File

@@ -8,6 +8,7 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -71,7 +72,8 @@ public class SparkUpdateProjectInfo implements Serializable {
     String preparedInfoPath) {
     Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
     Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
-    Dataset<CommunityResult> tmp = result
+    result
         .joinWith(
             resultProject, result.col("id").equalTo(resultProject.col("resultId")),
             "left")
@@ -79,9 +81,7 @@ public class SparkUpdateProjectInfo implements Serializable {
             CommunityResult r = value._1();
             Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
             return r;
-        }, Encoders.bean(CommunityResult.class));
-    long count = tmp.count();
-    tmp
+        }, Encoders.bean(CommunityResult.class))
         .map(
             (MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
             Encoders.STRING())
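
Net effect of this change: the intermediate tmp dataset and the long count = tmp.count() (which forced an extra Spark action) are gone, and the left join feeds the serialisation step directly. A minimal sketch of the resulting enrichment step, assuming the variable names shown in the hunk and the SparkSession, inputPath and preparedInfoPath from the surrounding job setup:

// Sketch only - needs MapFunction, Encoders, Optional and scala.Tuple2 in scope.
Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);

result
    // left join keeps every result; project info is attached only when resultId matches
    .joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")), "left")
    .map((MapFunction<Tuple2<CommunityResult, ResultProject>, CommunityResult>) value -> {
        CommunityResult r = value._1();
        // the right side of the pair is null when no project info was prepared for this result
        Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
        return r;
    }, Encoders.bean(CommunityResult.class))
    // ...and continues straight into the map-to-JSON-string and write, with no count() in between
    .map(
        (MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
        Encoders.STRING());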

View File

@@ -88,7 +88,10 @@ public class SparkDumpFunderResults implements Serializable {
 } else {
     String fName = p.getId().substring(3, p.getId().indexOf("_")).toUpperCase();
     if (fName.equalsIgnoreCase("ec")) {
-        if (p.getId().contains("h2020")) {
+        if (p.getId().contains("he")) {
+            fName += "_HE";
+        }
+        else if (p.getId().contains("h2020")) {
             fName += "_H2020";
         } else {
             fName += "_FP7";

View File

@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.dump.funderresults;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
+import java.util.Objects;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
@@ -17,6 +18,8 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.graph.dump.Constants;
 import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
@@ -99,13 +102,19 @@ public class SparkResultLinkedToProject implements Serializable {
             .map(
                 t2._1(),
                 communityMap, Constants.DUMPTYPE.FUNDER.getType());
-        cr.setProjects(t2._2().getProjectsList());
+        if (cr != null) {
+            cr.setProjects(t2._2().getProjectsList());
+        }
         return cr;
     }, Encoders.bean(CommunityResult.class))
+    .filter(Objects::nonNull)
+    .map(
+        (MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
+        Encoders.STRING())
     .write()
     .mode(SaveMode.Overwrite)
    .option("compression", "gzip")
-    .json(outputPath);
+    .text(outputPath);
 }
 }
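
Taken together, the mapper's output is now null-guarded, the nulls are filtered out, and each CommunityResult is serialised to a JSON string before being written as gzip-compressed text rather than through the json() writer. A sketch of that tail of the pipeline; "linked" (the joined dataset of result/project pairs built earlier in SparkResultLinkedToProject) and the tuple element types are assumptions for illustration:

linked
    .map((MapFunction<Tuple2<Result, ResultProject>, CommunityResult>) t2 -> {
        CommunityResult cr = (CommunityResult) ResultMapper
            .map(t2._1(), communityMap, Constants.DUMPTYPE.FUNDER.getType());
        if (cr != null) {                 // the mapper may return null for results it skips
            cr.setProjects(t2._2().getProjectsList());
        }
        return cr;
    }, Encoders.bean(CommunityResult.class))
    .filter(Objects::nonNull)             // drop the skipped results before writing
    .map(
        (MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
        Encoders.STRING())
    .write()
    .mode(SaveMode.Overwrite)
    .option("compression", "gzip")
    .text(outputPath);                    // one JSON document per line, gzip-compressed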

View File

@@ -238,7 +238,7 @@
         </property>
         <property>
             <name>outputPath</name>
-            <value>${workingDir}/tar</value>
+            <value>${outputPath}/dump</value>
         </property>
         <property>
             <name>sourcePath</name>
@@ -279,7 +279,7 @@
         </property>
         <property>
             <name>outputPath</name>
-            <value>${workingDir}/tar</value>
+            <value>${outputPath}/dump</value>
         </property>
     </configuration>
 </sub-workflow>
@@ -299,7 +299,7 @@
         </property>
         <property>
             <name>outputPath</name>
-            <value>${workingDir}/tar</value>
+            <value>${outputPath}/dump</value>
         </property>
         <property>
             <name>sourcePath</name>
@@ -315,28 +315,28 @@
         <error to="Kill" />
     </action>
-    <action name="make_archive">
-        <java>
-            <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
-            <arg>--hdfsPath</arg><arg>${outputPath}</arg>
-            <arg>--nameNode</arg><arg>${nameNode}</arg>
-            <arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
-        </java>
-        <ok to="should_upload"/>
-        <error to="Kill"/>
-    </action>
-<!-- <action name="make_archive">-->
-<!-- <java>-->
-<!-- <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>-->
-<!-- <arg>&#45;&#45;hdfsPath</arg><arg>${outputPath}/tar</arg>-->
-<!-- <arg>&#45;&#45;nameNode</arg><arg>${nameNode}</arg>-->
-<!-- <arg>&#45;&#45;sourcePath</arg><arg>${outputPath}/dump</arg>-->
-<!-- </java>-->
-<!-- <ok to="should_upload"/>-->
-<!-- <error to="Kill"/>-->
-<!-- </action>-->
+<!-- <action name="make_archive">-->
+<!-- <java>-->
+<!-- <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>-->
+<!-- <arg>&#45;&#45;hdfsPath</arg><arg>${outputPath}</arg>-->
+<!-- <arg>&#45;&#45;nameNode</arg><arg>${nameNode}</arg>-->
+<!-- <arg>&#45;&#45;sourcePath</arg><arg>${workingDir}/tar</arg>-->
+<!-- </java>-->
+<!-- <ok to="should_upload"/>-->
+<!-- <error to="Kill"/>-->
+<!-- </action>-->
+    <action name="make_archive">
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
+            <arg>--hdfsPath</arg><arg>${outputPath}/tar</arg>
+            <arg>--nameNode</arg><arg>${nameNode}</arg>
+            <arg>--sourcePath</arg><arg>${outputPath}/dump</arg>
+        </java>
+        <ok to="should_upload"/>
+        <error to="Kill"/>
+    </action>
     <decision name="should_upload">
         <switch>
             <case to="send_zenodo">${wf:conf('upload') eq true}</case>
@@ -347,7 +347,7 @@
     <action name="send_zenodo">
         <java>
            <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
-            <arg>--hdfsPath</arg><arg>${outputPath}</arg>
+            <arg>--hdfsPath</arg><arg>${outputPath}/tar/</arg>
             <arg>--nameNode</arg><arg>${nameNode}</arg>
             <arg>--accessToken</arg><arg>${accessToken}</arg>
             <arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
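
Read together, these workflow changes make the dump sub-workflows write under ${outputPath}/dump, let make_archive tar that directory into ${outputPath}/tar, and point send_zenodo at ${outputPath}/tar/ for the upload, instead of staging everything under ${workingDir}/tar. A sketch of the implied HDFS layout (paths taken from the properties above):

${outputPath}/dump/  <- JSON dumps written by the sub-workflows
${outputPath}/tar/   <- tar archives produced by MakeTar (make_archive) and read by SendToZenodoHDFS (send_zenodo)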

View File

@@ -456,7 +456,7 @@ public class DumpSubsetTest {
             getClass()
                 .getResource("/eu/dnetlib/dhp/oa/graph/dump/subset/dump/community_infrastructure")
                 .getPath())
-            .saveAsTextFile(workingDir.toString() + "/dump/community_infrastructure");
+            .saveAsTextFile(workingDir.toString() + "/dump/communities_infrastructures");
         SparkSelectValidRelationContext
             .main(
@@ -512,10 +512,12 @@ public class DumpSubsetTest {
             .textFile(workingDir.toString() + "/relation")
             .map(item -> OBJECT_MAPPER.readValue(item, eu.dnetlib.dhp.oa.model.graph.Relation.class));
-        Assertions.assertEquals(94, tmp.count());
-        Assertions.assertEquals(47, tmp.filter(r -> r.getSource().getId().startsWith("50|")).count());
-        Assertions.assertEquals(36, tmp.filter(r -> r.getSource().getId().startsWith("10|")).count());
-        Assertions.assertEquals(11, tmp.filter(r -> r.getSource().getId().startsWith("00|")).count());
+        Assertions.assertEquals(102, tmp.count());
+        Assertions.assertEquals(51, tmp.filter(r -> r.getSource().getId().startsWith("50|")).count());
+        Assertions.assertEquals(39, tmp.filter(r -> r.getSource().getId().startsWith("10|")).count());
+        Assertions.assertEquals(12, tmp.filter(r -> r.getSource().getId().startsWith("00|")).count());
     }
 }