enrichment steps #38
|
@ -75,17 +75,27 @@ public class PrepareProjects {
|
|||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static void exec(SparkSession spark, String progjectPath, String dbProjectPath, String outputPath) {
|
||||
Dataset<CSVProject> project = readPath(spark, progjectPath, CSVProject.class);
|
||||
private static void exec(SparkSession spark, String projectPath, String dbProjectPath, String outputPath) {
|
||||
Dataset<CSVProject> project = readPath(spark, projectPath, CSVProject.class);
|
||||
Dataset<ProjectSubset> dbProjects = readPath(spark, dbProjectPath, ProjectSubset.class);
|
||||
|
||||
dbProjects.joinWith(project, dbProjects.col("code").equalTo(project.col("id")), "left")
|
||||
.flatMap((FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
|
||||
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
||||
if(! csvProject.isPresent()){
|
||||
return null;
|
||||
dbProjects
|
||||
.joinWith(project, dbProjects.col("code").equalTo(project.col("id")), "left")
|
||||
.flatMap(getTuple2CSVProjectFlatMapFunction(), Encoders.bean(CSVProject.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
|
||||
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
|
||||
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
||||
List<CSVProject> csvProjectList = new ArrayList<>();
|
||||
if (csvProject.isPresent()) {
|
||||
|
||||
String[] programme = csvProject.get().getProgramme().split(";");
|
||||
Arrays
|
||||
.stream(programme)
|
||||
|
@ -95,36 +105,9 @@ public class PrepareProjects {
|
|||
proj.setId(csvProject.get().getId());
|
||||
csvProjectList.add(proj);
|
||||
});
|
||||
|
||||
}
|
||||
return csvProjectList.iterator();
|
||||
}, Encoders.bean(CSVProject.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
//
|
||||
// .map(value -> {
|
||||
// Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
||||
// }, Encoders.bean(CSVProject.class))
|
||||
// .filter(Objects::nonNull)
|
||||
// .toJavaRDD()
|
||||
// .flatMap(p -> {
|
||||
// List<CSVProject> csvProjectList = new ArrayList<>();
|
||||
// String[] programme = p.getProgramme().split(";");
|
||||
// Arrays
|
||||
// .stream(programme)
|
||||
// .forEach(value -> {
|
||||
// CSVProject csvProject = new CSVProject();
|
||||
// csvProject.setProgramme(value);
|
||||
// csvProject.setId(p.getId());
|
||||
// csvProjectList.add(csvProject);
|
||||
// });
|
||||
//
|
||||
// return csvProjectList.iterator();
|
||||
// })
|
||||
// .map(p -> OBJECT_MAPPER.writeValueAsString(p))
|
||||
// .saveAsTextFile(outputPath);
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
|
|
|
@ -1,15 +1,5 @@
|
|||
package eu.dnetlib.dhp.actionmanager.project;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
package eu.dnetlib.dhp.actionmanager.project;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.Closeable;
|
||||
|
@ -22,6 +12,19 @@ import java.util.List;
|
|||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
|
||||
public class ReadProjectsFromDB implements Closeable {
|
||||
|
||||
private final DbClient dbClient;
|
||||
|
@ -52,11 +55,12 @@ public class ReadProjectsFromDB implements Closeable {
|
|||
try (final ReadProjectsFromDB rbl = new ReadProjectsFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
||||
dbPassword)) {
|
||||
|
||||
log.info("Processing blacklist...");
|
||||
log.info("Processing projects...");
|
||||
rbl.execute(query, rbl::processProjectsEntry);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public void execute(final String sql, final Function<ResultSet, List<ProjectSubset>> producer) throws Exception {
|
||||
|
||||
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeProject(r));
|
||||
|
@ -100,7 +104,6 @@ public class ReadProjectsFromDB implements Closeable {
|
|||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
|
||||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
|
@ -110,4 +113,3 @@ public class ReadProjectsFromDB implements Closeable {
|
|||
writer.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareProgramme</name>
|
||||
<name>PrepareProjects</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.project.PrepareProjects</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -111,6 +111,7 @@
|
|||
</spark-opts>
|
||||
<arg>--projectPath</arg><arg>${workingDir}/projects</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
||||
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
|
||||
</spark>
|
||||
<ok to="create_updates"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -67,7 +67,7 @@ public class PrepareProjectTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void numberDistinctProgrammeTest() throws Exception {
|
||||
public void numberDistinctProjectTest() throws Exception {
|
||||
PrepareProjects
|
||||
.main(
|
||||
new String[] {
|
||||
|
@ -76,7 +76,10 @@ public class PrepareProjectTest {
|
|||
"-projectPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/projects_subset.json").getPath(),
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/preparedProjects"
|
||||
workingDir.toString() + "/preparedProjects",
|
||||
"-dbProjectPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/dbProject").getPath(),
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
@ -85,7 +88,7 @@ public class PrepareProjectTest {
|
|||
.textFile(workingDir.toString() + "/preparedProjects")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CSVProject.class));
|
||||
|
||||
Assertions.assertEquals(20, tmp.count());
|
||||
Assertions.assertEquals(8, tmp.count());
|
||||
|
||||
Dataset<CSVProject> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProject.class));
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ import java.io.IOException;
|
|||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -21,6 +20,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
|
||||
public class SparkUpdateProjectTest {
|
||||
|
@ -86,13 +86,10 @@ public class SparkUpdateProjectTest {
|
|||
JavaRDD<Project> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Project)aa.getPayload()))
|
||||
;
|
||||
.map(aa -> ((Project) aa.getPayload()));
|
||||
|
||||
Assertions.assertEquals(14, tmp.count());
|
||||
|
||||
// Dataset<CSVProgramme> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProgramme.class));
|
||||
//
|
||||
// Assertions.assertEquals(0, verificationDataset.filter("shortTitle =''").count());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
{"code":"894593"}
|
||||
{"code":"897004"}
|
||||
{"code":"896300"}
|
||||
{"code":"892890"}
|
||||
{"code":"886828"}
|
||||
{"code":"8867767"}
|
||||
{"code":"101003374"}
|
||||
{"code":"886776"}
|
Loading…
Reference in New Issue